From 3c54d94191b0888af3712f7c330943068604cab8 Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Tue, 23 Jan 2024 11:21:14 +0100 Subject: Add improved renderer routines with DMA option --- src/gba/utils.s | 82 +++++++++ src/profiling.c | 5 +- src/renderer_m0.c | 534 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 361 insertions(+), 260 deletions(-) create mode 100755 src/gba/utils.s (limited to 'src') diff --git a/src/gba/utils.s b/src/gba/utils.s new file mode 100755 index 0000000..d70d7ef --- /dev/null +++ b/src/gba/utils.s @@ -0,0 +1,82 @@ +.file "utils.s" +.section .iwram, "ax", %progbits +.arm +.align + +@ Efficient memcpy32 function (borrowed from TONC). It uses a two step +@ approach. It tries to copy 8 u32 chunks at a time with the ldm and stm +@ instructions and then copy the remainder if there are less than 8 chunks +@ left. +@ +@ r0: destination address +@ r1: source address +@ r2: number of 32bit chunks to copy +@ +.global copy32 +copy32: + cmp r2, #0 + beq .copy32_end + + and r12, r2, #7 @ r12 = r2 % 8 + movs r2, r2, lsr #3 @ r2 = r2 / 8 + beq .Lcopy32_residual + + @ Copy 8 32B chunks at a time + push {r4-r10} +.Lcopy32_chunks: + ldmia r1!, {r3-r10} + stmia r0!, {r3-r10} + subs r2, r2, #1 + bhi .Lcopy32_chunks + pop {r4-r10} + + @ Copy residual 32B chunks (0-7) +.Lcopy32_residual: + subs r12, r12, #1 + ldrhs r3, [r1], #4 + strhs r3, [r0], #4 + bhi .Lcopy32_residual + +.copy32_end: + bx lr + + +@ Efficient memset32 function (borrowed from TONC). It uses a two step +@ approach. Uses the same stmia approach from memcpy32 but, no need for ldmia +@ +@ r0: destination address +@ r1: u32 value to set +@ r2: number of 32bit chunks to set +@ +.global set32 +set32: + cmp r2, #0 + beq .set32_end + + and r12, r2, #7 @ r12 = r2 % 8 + movs r2, r2, lsr #3 @ r2 = r2 / 8 + beq .Lset32_residual + + @ Set 8 32B chunks at a time + push {r4-r9} + mov r3, r1 + mov r4, r1 + mov r5, r1 + mov r6, r1 + mov r7, r1 + mov r8, r1 + mov r9, r1 +.Lset32_chunks: + stmia r0!, {r1, r3-r9} + subs r2, r2, #1 + bhi .Lset32_chunks + pop {r4-r9} + + @ Set residual 32B chunks (0-7) +.Lset32_residual: + subs r12, r12, #1 + strhs r1, [r0], #4 + bhi .Lset32_residual + +.set32_end: + bx lr diff --git a/src/profiling.c b/src/profiling.c index 6b073ed..07f4bbf 100644 --- a/src/profiling.c +++ b/src/profiling.c @@ -9,7 +9,7 @@ #if PROF_ENABLE > 0 #ifndef PROF_RESET_MINMAX -#define PROF_RESET_MINMAX false +#define PROF_RESET_MINMAX true #endif // Maximum number of profiling to monitor. @@ -74,7 +74,7 @@ bool prof_show = true; prof_frame_avg, \ (u32)((u64)280896 * 60 / (prof_frame_avg + 1)));\ txt_drawf_small("MAX: %.9l/%l", 8 * 19, 0, COL_FG, \ - prof_frame_time_max,280896);\ + prof_frame_time_max, 280896);\ for (size_t idx = 0; idx < PROF_NUM; idx++) { \ txt_drawf_small("%s %.9l (%.9l %.9l) %08x:%08x", 0, 8 * (idx + 1), COL_FG, \ prof_type_str[idx], \ @@ -91,6 +91,7 @@ bool prof_show = true; if (prof_reset_minmax) { \ prof_min[idx] = -1; \ prof_max[idx] = 0; \ + prof_frame_time_max = 0; \ } \ prof_times[idx] = 0; \ prof_count[idx] = 0; \ diff --git a/src/renderer_m0.c b/src/renderer_m0.c index 133b39d..064cc0e 100644 --- a/src/renderer_m0.c +++ b/src/renderer_m0.c @@ -13,8 +13,9 @@ // #define SUBPIXEL_LINES 1 -#define DEC_BIG_LUT 1 #define FLIP_TYPE 3 +#define DISABLE_BOUNDCHECK_SCREEN 0 +#define NO_DMA 0 // Front/back buffers for double buffering. #define BUF_0 ((u32*)(MEM_VRAM)) @@ -38,7 +39,7 @@ static u32 dirty_tiles[21] = {0}; // Boundchecks can be disable at compile time but this will not always improve // the performance and can in fact make it worse. It is possible that this is // due to some aliasing optimizations but not sure at this moment. -#ifdef DISABLE_BOUNDCHECK_SCREEN +#if DISABLE_BOUNDCHECK_SCREEN > 0 #define BOUNDCHECK_SCREEN(X,Y) #else #define BOUNDCHECK_SCREEN(X,Y) if ((X) >= SCREEN_WIDTH || (Y) >= SCREEN_HEIGHT) return; @@ -66,7 +67,11 @@ IWRAM_CODE void screen_fill(u8 clr) { // We have to make sure we leave the last tile blank to use as alpha channel // when moving the BG during double buffering. +#if NO_DMA == 0 dma_fill(backbuf, 0x11111111 * clr, KB(20) - 32, 3); +#else + set32(backbuf, 0x11111111 * clr, (KB(20) / 4) - 8); +#endif redraw(); } @@ -85,8 +90,8 @@ draw_pixel(size_t x, size_t y, u8 clr) { // Update backbuffer. size_t shift = start_col * sizeof(u32); u32 mask = 0xF << shift; - u32 row = clr << shift; - *dst = (*dst & ~mask) | row; + u32 color = clr << shift; + *dst = (*dst & ~mask) | color; dirty_tiles[tile_y] |= 1 << tile_x; } @@ -117,21 +122,21 @@ draw_hline(size_t x0, size_t x1, size_t y0, u8 clr) { size_t shift_left = start_col * 4; size_t shift_right = (7 - end_col) * 4; u32 mask = (0xFFFFFFFF >> shift_right) & (0xFFFFFFFF << shift_left); - u32 row = (0x11111111 * clr) & mask; - *dst = (*dst & ~mask) | row; + u32 color = (0x11111111 * clr) & mask; + *dst = (*dst & ~mask) | color; } else { size_t shift_left = start_col * 4; size_t shift_right = (7 - end_col) * 4; u32 mask = 0xFFFFFFFF; - u32 row = 0x11111111 * clr; - *dst = (*dst & ~(mask << shift_left)) | (row << shift_left); + u32 color = 0x11111111 * clr; + *dst = (*dst & ~(mask << shift_left)) | (color << shift_left); dst += 8; for (size_t i = 1; i < dtx; i++) { dirty |= (1 << (tile_x0 + i)); - *dst = row; + *dst = color; dst += 8; } - *dst = (*dst & ~(mask >> shift_right)) | (row >> shift_right); + *dst = (*dst & ~(mask >> shift_right)) | (color >> shift_right); } dirty_tiles[tile_y] |= dirty; } @@ -156,26 +161,26 @@ draw_vline(size_t x0, size_t y0, size_t y1, u8 clr) { u32 *dst = &backbuf[start_row0 + (tile_x + tile_y * 32) * 8]; u32 mask = 0x0000000F << shift_left; - u32 row = (0x11111111 * clr) & mask; + u32 color = clr << shift_left; u32 dty = tile_y1 - tile_y0; if (dty < 1) { for (size_t i = 0; i <= (y1 - y0); i++, dst++) { - dst[0] = (dst[0] & ~mask) | row; + dst[0] = (dst[0] & ~mask) | color; } } else { for (size_t i = 0; i < (8 - start_row0); i++, dst++) { - dst[0] = (dst[0] & ~mask) | row; + dst[0] = (dst[0] & ~mask) | color; } dst += 8 * 31; for (size_t j = 1; j < dty; j++) { dirty_tiles[tile_y0 + j] |= dirty; for (size_t i = 0; i < 8; i++, dst++) { - dst[0] = (dst[0] & ~mask) | row; + dst[0] = (dst[0] & ~mask) | color; } dst += 8 * 31; } for (size_t i = 0; i <= start_row1; i++, dst++) { - dst[0] = (dst[0] & ~mask) | row; + dst[0] = (dst[0] & ~mask) | color; } } dirty_tiles[tile_y0] |= dirty; @@ -313,8 +318,8 @@ draw_filled_rect(size_t x0, size_t y0, size_t x1, size_t y1, u8 clr) { MAYBE_SWAP(x0, x1); MAYBE_SWAP(y0, y1); - // Special condition. If the screen is to be completely filled, use the DMA - // instead. + // Special condition. If the screen is to be completely filled, use the + // full clearing functions instead. if (x0 == 0 && x1 >= (SCREEN_WIDTH - 1) && y0 == 0 && y1 >= (SCREEN_HEIGHT - 1)) { screen_fill(clr); return; @@ -326,161 +331,135 @@ draw_filled_rect(size_t x0, size_t y0, size_t x1, size_t y1, u8 clr) { } // -// Sprites (chr/icn). +// Sprites (1bpp). // -#if DEC_BIG_LUT == 1 -static u32 dec_byte_flip_x[256] = { - 0x00000000, 0x00000001, 0x00000010, 0x00000011, 0x00000100, - 0x00000101, 0x00000110, 0x00000111, 0x00001000, 0x00001001, - 0x00001010, 0x00001011, 0x00001100, 0x00001101, 0x00001110, - 0x00001111, 0x00010000, 0x00010001, 0x00010010, 0x00010011, - 0x00010100, 0x00010101, 0x00010110, 0x00010111, 0x00011000, - 0x00011001, 0x00011010, 0x00011011, 0x00011100, 0x00011101, - 0x00011110, 0x00011111, 0x00100000, 0x00100001, 0x00100010, - 0x00100011, 0x00100100, 0x00100101, 0x00100110, 0x00100111, - 0x00101000, 0x00101001, 0x00101010, 0x00101011, 0x00101100, - 0x00101101, 0x00101110, 0x00101111, 0x00110000, 0x00110001, - 0x00110010, 0x00110011, 0x00110100, 0x00110101, 0x00110110, - 0x00110111, 0x00111000, 0x00111001, 0x00111010, 0x00111011, - 0x00111100, 0x00111101, 0x00111110, 0x00111111, 0x01000000, - 0x01000001, 0x01000010, 0x01000011, 0x01000100, 0x01000101, - 0x01000110, 0x01000111, 0x01001000, 0x01001001, 0x01001010, - 0x01001011, 0x01001100, 0x01001101, 0x01001110, 0x01001111, - 0x01010000, 0x01010001, 0x01010010, 0x01010011, 0x01010100, - 0x01010101, 0x01010110, 0x01010111, 0x01011000, 0x01011001, - 0x01011010, 0x01011011, 0x01011100, 0x01011101, 0x01011110, - 0x01011111, 0x01100000, 0x01100001, 0x01100010, 0x01100011, - 0x01100100, 0x01100101, 0x01100110, 0x01100111, 0x01101000, - 0x01101001, 0x01101010, 0x01101011, 0x01101100, 0x01101101, - 0x01101110, 0x01101111, 0x01110000, 0x01110001, 0x01110010, - 0x01110011, 0x01110100, 0x01110101, 0x01110110, 0x01110111, - 0x01111000, 0x01111001, 0x01111010, 0x01111011, 0x01111100, - 0x01111101, 0x01111110, 0x01111111, 0x10000000, 0x10000001, - 0x10000010, 0x10000011, 0x10000100, 0x10000101, 0x10000110, - 0x10000111, 0x10001000, 0x10001001, 0x10001010, 0x10001011, - 0x10001100, 0x10001101, 0x10001110, 0x10001111, 0x10010000, - 0x10010001, 0x10010010, 0x10010011, 0x10010100, 0x10010101, - 0x10010110, 0x10010111, 0x10011000, 0x10011001, 0x10011010, - 0x10011011, 0x10011100, 0x10011101, 0x10011110, 0x10011111, - 0x10100000, 0x10100001, 0x10100010, 0x10100011, 0x10100100, - 0x10100101, 0x10100110, 0x10100111, 0x10101000, 0x10101001, - 0x10101010, 0x10101011, 0x10101100, 0x10101101, 0x10101110, - 0x10101111, 0x10110000, 0x10110001, 0x10110010, 0x10110011, - 0x10110100, 0x10110101, 0x10110110, 0x10110111, 0x10111000, - 0x10111001, 0x10111010, 0x10111011, 0x10111100, 0x10111101, - 0x10111110, 0x10111111, 0x11000000, 0x11000001, 0x11000010, - 0x11000011, 0x11000100, 0x11000101, 0x11000110, 0x11000111, - 0x11001000, 0x11001001, 0x11001010, 0x11001011, 0x11001100, - 0x11001101, 0x11001110, 0x11001111, 0x11010000, 0x11010001, - 0x11010010, 0x11010011, 0x11010100, 0x11010101, 0x11010110, - 0x11010111, 0x11011000, 0x11011001, 0x11011010, 0x11011011, - 0x11011100, 0x11011101, 0x11011110, 0x11011111, 0x11100000, - 0x11100001, 0x11100010, 0x11100011, 0x11100100, 0x11100101, - 0x11100110, 0x11100111, 0x11101000, 0x11101001, 0x11101010, - 0x11101011, 0x11101100, 0x11101101, 0x11101110, 0x11101111, - 0x11110000, 0x11110001, 0x11110010, 0x11110011, 0x11110100, - 0x11110101, 0x11110110, 0x11110111, 0x11111000, 0x11111001, - 0x11111010, 0x11111011, 0x11111100, 0x11111101, 0x11111110, - 0x11111111 +static u32 lut_1bpp_mask[256] = { + 0x00000000, 0xf0000000, 0x0f000000, 0xff000000, 0x00f00000, + 0xf0f00000, 0x0ff00000, 0xfff00000, 0x000f0000, 0xf00f0000, + 0x0f0f0000, 0xff0f0000, 0x00ff0000, 0xf0ff0000, 0x0fff0000, + 0xffff0000, 0x0000f000, 0xf000f000, 0x0f00f000, 0xff00f000, + 0x00f0f000, 0xf0f0f000, 0x0ff0f000, 0xfff0f000, 0x000ff000, + 0xf00ff000, 0x0f0ff000, 0xff0ff000, 0x00fff000, 0xf0fff000, + 0x0ffff000, 0xfffff000, 0x00000f00, 0xf0000f00, 0x0f000f00, + 0xff000f00, 0x00f00f00, 0xf0f00f00, 0x0ff00f00, 0xfff00f00, + 0x000f0f00, 0xf00f0f00, 0x0f0f0f00, 0xff0f0f00, 0x00ff0f00, + 0xf0ff0f00, 0x0fff0f00, 0xffff0f00, 0x0000ff00, 0xf000ff00, + 0x0f00ff00, 0xff00ff00, 0x00f0ff00, 0xf0f0ff00, 0x0ff0ff00, + 0xfff0ff00, 0x000fff00, 0xf00fff00, 0x0f0fff00, 0xff0fff00, + 0x00ffff00, 0xf0ffff00, 0x0fffff00, 0xffffff00, 0x000000f0, + 0xf00000f0, 0x0f0000f0, 0xff0000f0, 0x00f000f0, 0xf0f000f0, + 0x0ff000f0, 0xfff000f0, 0x000f00f0, 0xf00f00f0, 0x0f0f00f0, + 0xff0f00f0, 0x00ff00f0, 0xf0ff00f0, 0x0fff00f0, 0xffff00f0, + 0x0000f0f0, 0xf000f0f0, 0x0f00f0f0, 0xff00f0f0, 0x00f0f0f0, + 0xf0f0f0f0, 0x0ff0f0f0, 0xfff0f0f0, 0x000ff0f0, 0xf00ff0f0, + 0x0f0ff0f0, 0xff0ff0f0, 0x00fff0f0, 0xf0fff0f0, 0x0ffff0f0, + 0xfffff0f0, 0x00000ff0, 0xf0000ff0, 0x0f000ff0, 0xff000ff0, + 0x00f00ff0, 0xf0f00ff0, 0x0ff00ff0, 0xfff00ff0, 0x000f0ff0, + 0xf00f0ff0, 0x0f0f0ff0, 0xff0f0ff0, 0x00ff0ff0, 0xf0ff0ff0, + 0x0fff0ff0, 0xffff0ff0, 0x0000fff0, 0xf000fff0, 0x0f00fff0, + 0xff00fff0, 0x00f0fff0, 0xf0f0fff0, 0x0ff0fff0, 0xfff0fff0, + 0x000ffff0, 0xf00ffff0, 0x0f0ffff0, 0xff0ffff0, 0x00fffff0, + 0xf0fffff0, 0x0ffffff0, 0xfffffff0, 0x0000000f, 0xf000000f, + 0x0f00000f, 0xff00000f, 0x00f0000f, 0xf0f0000f, 0x0ff0000f, + 0xfff0000f, 0x000f000f, 0xf00f000f, 0x0f0f000f, 0xff0f000f, + 0x00ff000f, 0xf0ff000f, 0x0fff000f, 0xffff000f, 0x0000f00f, + 0xf000f00f, 0x0f00f00f, 0xff00f00f, 0x00f0f00f, 0xf0f0f00f, + 0x0ff0f00f, 0xfff0f00f, 0x000ff00f, 0xf00ff00f, 0x0f0ff00f, + 0xff0ff00f, 0x00fff00f, 0xf0fff00f, 0x0ffff00f, 0xfffff00f, + 0x00000f0f, 0xf0000f0f, 0x0f000f0f, 0xff000f0f, 0x00f00f0f, + 0xf0f00f0f, 0x0ff00f0f, 0xfff00f0f, 0x000f0f0f, 0xf00f0f0f, + 0x0f0f0f0f, 0xff0f0f0f, 0x00ff0f0f, 0xf0ff0f0f, 0x0fff0f0f, + 0xffff0f0f, 0x0000ff0f, 0xf000ff0f, 0x0f00ff0f, 0xff00ff0f, + 0x00f0ff0f, 0xf0f0ff0f, 0x0ff0ff0f, 0xfff0ff0f, 0x000fff0f, + 0xf00fff0f, 0x0f0fff0f, 0xff0fff0f, 0x00ffff0f, 0xf0ffff0f, + 0x0fffff0f, 0xffffff0f, 0x000000ff, 0xf00000ff, 0x0f0000ff, + 0xff0000ff, 0x00f000ff, 0xf0f000ff, 0x0ff000ff, 0xfff000ff, + 0x000f00ff, 0xf00f00ff, 0x0f0f00ff, 0xff0f00ff, 0x00ff00ff, + 0xf0ff00ff, 0x0fff00ff, 0xffff00ff, 0x0000f0ff, 0xf000f0ff, + 0x0f00f0ff, 0xff00f0ff, 0x00f0f0ff, 0xf0f0f0ff, 0x0ff0f0ff, + 0xfff0f0ff, 0x000ff0ff, 0xf00ff0ff, 0x0f0ff0ff, 0xff0ff0ff, + 0x00fff0ff, 0xf0fff0ff, 0x0ffff0ff, 0xfffff0ff, 0x00000fff, + 0xf0000fff, 0x0f000fff, 0xff000fff, 0x00f00fff, 0xf0f00fff, + 0x0ff00fff, 0xfff00fff, 0x000f0fff, 0xf00f0fff, 0x0f0f0fff, + 0xff0f0fff, 0x00ff0fff, 0xf0ff0fff, 0x0fff0fff, 0xffff0fff, + 0x0000ffff, 0xf000ffff, 0x0f00ffff, 0xff00ffff, 0x00f0ffff, + 0xf0f0ffff, 0x0ff0ffff, 0xfff0ffff, 0x000fffff, 0xf00fffff, + 0x0f0fffff, 0xff0fffff, 0x00ffffff, 0xf0ffffff, 0x0fffffff, + 0xffffffff }; -static u32 dec_byte[256] = { - 0x00000000, 0x10000000, 0x01000000, 0x11000000, 0x00100000, - 0x10100000, 0x01100000, 0x11100000, 0x00010000, 0x10010000, - 0x01010000, 0x11010000, 0x00110000, 0x10110000, 0x01110000, - 0x11110000, 0x00001000, 0x10001000, 0x01001000, 0x11001000, - 0x00101000, 0x10101000, 0x01101000, 0x11101000, 0x00011000, - 0x10011000, 0x01011000, 0x11011000, 0x00111000, 0x10111000, - 0x01111000, 0x11111000, 0x00000100, 0x10000100, 0x01000100, - 0x11000100, 0x00100100, 0x10100100, 0x01100100, 0x11100100, - 0x00010100, 0x10010100, 0x01010100, 0x11010100, 0x00110100, - 0x10110100, 0x01110100, 0x11110100, 0x00001100, 0x10001100, - 0x01001100, 0x11001100, 0x00101100, 0x10101100, 0x01101100, - 0x11101100, 0x00011100, 0x10011100, 0x01011100, 0x11011100, - 0x00111100, 0x10111100, 0x01111100, 0x11111100, 0x00000010, - 0x10000010, 0x01000010, 0x11000010, 0x00100010, 0x10100010, - 0x01100010, 0x11100010, 0x00010010, 0x10010010, 0x01010010, - 0x11010010, 0x00110010, 0x10110010, 0x01110010, 0x11110010, - 0x00001010, 0x10001010, 0x01001010, 0x11001010, 0x00101010, - 0x10101010, 0x01101010, 0x11101010, 0x00011010, 0x10011010, - 0x01011010, 0x11011010, 0x00111010, 0x10111010, 0x01111010, - 0x11111010, 0x00000110, 0x10000110, 0x01000110, 0x11000110, - 0x00100110, 0x10100110, 0x01100110, 0x11100110, 0x00010110, - 0x10010110, 0x01010110, 0x11010110, 0x00110110, 0x10110110, - 0x01110110, 0x11110110, 0x00001110, 0x10001110, 0x01001110, - 0x11001110, 0x00101110, 0x10101110, 0x01101110, 0x11101110, - 0x00011110, 0x10011110, 0x01011110, 0x11011110, 0x00111110, - 0x10111110, 0x01111110, 0x11111110, 0x00000001, 0x10000001, - 0x01000001, 0x11000001, 0x00100001, 0x10100001, 0x01100001, - 0x11100001, 0x00010001, 0x10010001, 0x01010001, 0x11010001, - 0x00110001, 0x10110001, 0x01110001, 0x11110001, 0x00001001, - 0x10001001, 0x01001001, 0x11001001, 0x00101001, 0x10101001, - 0x01101001, 0x11101001, 0x00011001, 0x10011001, 0x01011001, - 0x11011001, 0x00111001, 0x10111001, 0x01111001, 0x11111001, - 0x00000101, 0x10000101, 0x01000101, 0x11000101, 0x00100101, - 0x10100101, 0x01100101, 0x11100101, 0x00010101, 0x10010101, - 0x01010101, 0x11010101, 0x00110101, 0x10110101, 0x01110101, - 0x11110101, 0x00001101, 0x10001101, 0x01001101, 0x11001101, - 0x00101101, 0x10101101, 0x01101101, 0x11101101, 0x00011101, - 0x10011101, 0x01011101, 0x11011101, 0x00111101, 0x10111101, - 0x01111101, 0x11111101, 0x00000011, 0x10000011, 0x01000011, - 0x11000011, 0x00100011, 0x10100011, 0x01100011, 0x11100011, - 0x00010011, 0x10010011, 0x01010011, 0x11010011, 0x00110011, - 0x10110011, 0x01110011, 0x11110011, 0x00001011, 0x10001011, - 0x01001011, 0x11001011, 0x00101011, 0x10101011, 0x01101011, - 0x11101011, 0x00011011, 0x10011011, 0x01011011, 0x11011011, - 0x00111011, 0x10111011, 0x01111011, 0x11111011, 0x00000111, - 0x10000111, 0x01000111, 0x11000111, 0x00100111, 0x10100111, - 0x01100111, 0x11100111, 0x00010111, 0x10010111, 0x01010111, - 0x11010111, 0x00110111, 0x10110111, 0x01110111, 0x11110111, - 0x00001111, 0x10001111, 0x01001111, 0x11001111, 0x00101111, - 0x10101111, 0x01101111, 0x11101111, 0x00011111, 0x10011111, - 0x01011111, 0x11011111, 0x00111111, 0x10111111, 0x01111111, - 0x11111111 +static u32 lut_1bpp_mask_flip_x[256] = { + 0x00000000, 0x0000000f, 0x000000f0, 0x000000ff, 0x00000f00, + 0x00000f0f, 0x00000ff0, 0x00000fff, 0x0000f000, 0x0000f00f, + 0x0000f0f0, 0x0000f0ff, 0x0000ff00, 0x0000ff0f, 0x0000fff0, + 0x0000ffff, 0x000f0000, 0x000f000f, 0x000f00f0, 0x000f00ff, + 0x000f0f00, 0x000f0f0f, 0x000f0ff0, 0x000f0fff, 0x000ff000, + 0x000ff00f, 0x000ff0f0, 0x000ff0ff, 0x000fff00, 0x000fff0f, + 0x000ffff0, 0x000fffff, 0x00f00000, 0x00f0000f, 0x00f000f0, + 0x00f000ff, 0x00f00f00, 0x00f00f0f, 0x00f00ff0, 0x00f00fff, + 0x00f0f000, 0x00f0f00f, 0x00f0f0f0, 0x00f0f0ff, 0x00f0ff00, + 0x00f0ff0f, 0x00f0fff0, 0x00f0ffff, 0x00ff0000, 0x00ff000f, + 0x00ff00f0, 0x00ff00ff, 0x00ff0f00, 0x00ff0f0f, 0x00ff0ff0, + 0x00ff0fff, 0x00fff000, 0x00fff00f, 0x00fff0f0, 0x00fff0ff, + 0x00ffff00, 0x00ffff0f, 0x00fffff0, 0x00ffffff, 0x0f000000, + 0x0f00000f, 0x0f0000f0, 0x0f0000ff, 0x0f000f00, 0x0f000f0f, + 0x0f000ff0, 0x0f000fff, 0x0f00f000, 0x0f00f00f, 0x0f00f0f0, + 0x0f00f0ff, 0x0f00ff00, 0x0f00ff0f, 0x0f00fff0, 0x0f00ffff, + 0x0f0f0000, 0x0f0f000f, 0x0f0f00f0, 0x0f0f00ff, 0x0f0f0f00, + 0x0f0f0f0f, 0x0f0f0ff0, 0x0f0f0fff, 0x0f0ff000, 0x0f0ff00f, + 0x0f0ff0f0, 0x0f0ff0ff, 0x0f0fff00, 0x0f0fff0f, 0x0f0ffff0, + 0x0f0fffff, 0x0ff00000, 0x0ff0000f, 0x0ff000f0, 0x0ff000ff, + 0x0ff00f00, 0x0ff00f0f, 0x0ff00ff0, 0x0ff00fff, 0x0ff0f000, + 0x0ff0f00f, 0x0ff0f0f0, 0x0ff0f0ff, 0x0ff0ff00, 0x0ff0ff0f, + 0x0ff0fff0, 0x0ff0ffff, 0x0fff0000, 0x0fff000f, 0x0fff00f0, + 0x0fff00ff, 0x0fff0f00, 0x0fff0f0f, 0x0fff0ff0, 0x0fff0fff, + 0x0ffff000, 0x0ffff00f, 0x0ffff0f0, 0x0ffff0ff, 0x0fffff00, + 0x0fffff0f, 0x0ffffff0, 0x0fffffff, 0xf0000000, 0xf000000f, + 0xf00000f0, 0xf00000ff, 0xf0000f00, 0xf0000f0f, 0xf0000ff0, + 0xf0000fff, 0xf000f000, 0xf000f00f, 0xf000f0f0, 0xf000f0ff, + 0xf000ff00, 0xf000ff0f, 0xf000fff0, 0xf000ffff, 0xf00f0000, + 0xf00f000f, 0xf00f00f0, 0xf00f00ff, 0xf00f0f00, 0xf00f0f0f, + 0xf00f0ff0, 0xf00f0fff, 0xf00ff000, 0xf00ff00f, 0xf00ff0f0, + 0xf00ff0ff, 0xf00fff00, 0xf00fff0f, 0xf00ffff0, 0xf00fffff, + 0xf0f00000, 0xf0f0000f, 0xf0f000f0, 0xf0f000ff, 0xf0f00f00, + 0xf0f00f0f, 0xf0f00ff0, 0xf0f00fff, 0xf0f0f000, 0xf0f0f00f, + 0xf0f0f0f0, 0xf0f0f0ff, 0xf0f0ff00, 0xf0f0ff0f, 0xf0f0fff0, + 0xf0f0ffff, 0xf0ff0000, 0xf0ff000f, 0xf0ff00f0, 0xf0ff00ff, + 0xf0ff0f00, 0xf0ff0f0f, 0xf0ff0ff0, 0xf0ff0fff, 0xf0fff000, + 0xf0fff00f, 0xf0fff0f0, 0xf0fff0ff, 0xf0ffff00, 0xf0ffff0f, + 0xf0fffff0, 0xf0ffffff, 0xff000000, 0xff00000f, 0xff0000f0, + 0xff0000ff, 0xff000f00, 0xff000f0f, 0xff000ff0, 0xff000fff, + 0xff00f000, 0xff00f00f, 0xff00f0f0, 0xff00f0ff, 0xff00ff00, + 0xff00ff0f, 0xff00fff0, 0xff00ffff, 0xff0f0000, 0xff0f000f, + 0xff0f00f0, 0xff0f00ff, 0xff0f0f00, 0xff0f0f0f, 0xff0f0ff0, + 0xff0f0fff, 0xff0ff000, 0xff0ff00f, 0xff0ff0f0, 0xff0ff0ff, + 0xff0fff00, 0xff0fff0f, 0xff0ffff0, 0xff0fffff, 0xfff00000, + 0xfff0000f, 0xfff000f0, 0xfff000ff, 0xfff00f00, 0xfff00f0f, + 0xfff00ff0, 0xfff00fff, 0xfff0f000, 0xfff0f00f, 0xfff0f0f0, + 0xfff0f0ff, 0xfff0ff00, 0xfff0ff0f, 0xfff0fff0, 0xfff0ffff, + 0xffff0000, 0xffff000f, 0xffff00f0, 0xffff00ff, 0xffff0f00, + 0xffff0f0f, 0xffff0ff0, 0xffff0fff, 0xfffff000, 0xfffff00f, + 0xfffff0f0, 0xfffff0ff, 0xffffff00, 0xffffff0f, 0xfffffff0, + 0xffffffff }; -IWRAM_CODE -static inline +// Create a mask for zero sprite values in each nibble. +// For example: 0x12305008 -> 0xFFF0F00F +INLINE u32 -decode_1bpp(u8 row, u8 flip_x) { - if (flip_x) { - return dec_byte_flip_x[row]; - } - return dec_byte[row]; +create_zero_mask(u32 x) { + x |= x >> 2; + x |= x >> 1; + x &= 0x11111111; + return x * 0xf; } -#else -static u16 dec_nibble[] = { - 0x0000, 0x1000, 0x0100, 0x1100, - 0x0010, 0x1010, 0x0110, 0x1110, - 0x0001, 0x1001, 0x0101, 0x1101, - 0x0011, 0x1011, 0x0111, 0x1111, -}; - -static u16 dec_nibble_flip_x[] = { - 0x0000, 0x0001, 0x0010, 0x0011, - 0x0100, 0x0101, 0x0110, 0x0111, - 0x1000, 0x1001, 0x1010, 0x1011, - 0x1100, 0x1101, 0x1110, 0x1111, -}; - -IWRAM_CODE -static inline -u32 -decode_1bpp(u8 row, u8 flip_x) { - if (flip_x) { - u16 *lut = dec_nibble_flip_x; - return (u32)lut[(row >> 4) & 0xF] << 16 | (u32)lut[(row >> 0) & 0xF]; - } - u16 *lut = dec_nibble; - return (u32)lut[(row >> 0) & 0xF] << 16 | (u32)lut[(row >> 4) & 0xF]; -} -#endif IWRAM_CODE UNROLL_LOOPS void -draw_chr(size_t x, size_t y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { +draw_sprite(size_t x, size_t y, u32 *sprite, u8 clear) { + // Copy a 4bpp sprite into memory. Color 0 is the transparency color. BOUNDCHECK_SCREEN(x, y); size_t tile_x0 = x / 8; size_t tile_x1 = (x + 7) / 8; @@ -491,71 +470,65 @@ draw_chr(size_t x, size_t y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { size_t shift_right = (8 - start_col) * 4; u32 dirty = (1 << tile_x0) | (1 << tile_x1); u32 *dst = &backbuf[start_row + (tile_x0 + tile_y * 32) * 8]; -#if DEC_BIG_LUT - u32 *lut = flip_x ? dec_byte_flip_x : dec_byte; -#endif - if (!flip_y) { - for(size_t v = 0; v < 8; v++, dst++) { - if ((y + v) >= SCREEN_HEIGHT) break; - u8 ch1 = sprite[v + 0]; - u8 ch2 = sprite[v + 8]; -#if DEC_BIG_LUT - u32 clr_a = lut[ch1]; - u32 clr_b = lut[ch2]; -#else - u32 clr_a = decode_1bpp(ch1, flip_x); - u32 clr_b = decode_1bpp(ch2, flip_x); -#endif - u32 mask_a = (clr_a * 0xF); - u32 mask_b = (clr_b * 0xF); - u32 mask = (mask_a | mask_b); - u32 color; - if (clr == 0) { - color = clr_a + (clr_b << 1); - } else if (clr == 15) { - color = 0; - } else { - color = (clr_a | clr_b) * clr; - } - dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); - dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); - if ((start_row + v) == 7) { - dirty_tiles[tile_y + 1] |= dirty; - dst += (32 - 1) * 8; - } + size_t n_rows = 8; + if (y + 8 > SCREEN_HEIGHT) { + n_rows = 8 - ((y + 8) - SCREEN_HEIGHT); + } + + size_t n0 = MIN(8 - start_row, n_rows); + if (clear) { + for(size_t v = 0; v < n0; v++, dst++) { + u32 row = sprite[v]; + + u32 mask = create_zero_mask(row); + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + + dst[0] = (dst[0] & ~msk0); + dst[8] = (dst[8] & ~msk1); + } + dst += (32 - 1) * 8; + for(size_t v = n0; v < n_rows; v++, dst++) { + u32 row = sprite[v]; + + u32 mask = create_zero_mask(row); + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + + dst[0] = (dst[0] & ~msk0); + dst[8] = (dst[8] & ~msk1); } } else { - for(size_t v = 0; v < 8; v++, dst++) { - if ((y + v) >= SCREEN_HEIGHT) break; - u8 ch1 = sprite[(7 - v) + 0]; - u8 ch2 = sprite[(7 - v) + 8]; -#if DEC_BIG_LUT - u32 clr_a = lut[ch1]; - u32 clr_b = lut[ch2]; -#else - u32 clr_a = decode_1bpp(ch1, flip_x); - u32 clr_b = decode_1bpp(ch2, flip_x); -#endif - u32 mask_a = (clr_a * 0xF); - u32 mask_b = (clr_b * 0xF); - u32 mask = (mask_a | mask_b); - u32 color; - if (clr == 0) { - color = clr_a + (clr_b << 1); - } else if (clr == 15) { - color = 0; - } else { - color = (clr_a | clr_b) * clr; - } - dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); - dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); - if ((start_row + v) == 7) { - dirty_tiles[tile_y + 1] |= dirty; - dst += (32 - 1) * 8; - } + for(size_t v = 0; v < n0; v++, dst++) { + u32 row = sprite[v]; + + u32 mask = create_zero_mask(row); + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = row << shift_left; + u32 clr1 = row >> shift_right; + + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; + } + dst += (32 - 1) * 8; + for(size_t v = n0; v < n_rows; v++, dst++) { + u32 row = sprite[v]; + + u32 mask = create_zero_mask(row); + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = row << shift_left; + u32 clr1 = row >> shift_right; + + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; } } dirty_tiles[tile_y] |= dirty; + if (start_row != 0) { + dirty_tiles[tile_y + 1] |= dirty; + } } IWRAM_CODE @@ -572,47 +545,63 @@ draw_icn(size_t x, size_t y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { size_t shift_right = (8 - start_col) * 4; u32 dirty = (1 << tile_x0) | (1 << tile_x1); u32 *dst = &backbuf[start_row + (tile_x0 + tile_y * 32) * 8]; -#if DEC_BIG_LUT - u32 *lut = flip_x ? dec_byte_flip_x : dec_byte; -#endif + u32 color = clr * 0x11111111; + u32 *lut = flip_x ? lut_1bpp_mask_flip_x : lut_1bpp_mask; + size_t n_rows = 8; + if (y + 8 > SCREEN_HEIGHT) { + n_rows = 8 - ((y + 8) - SCREEN_HEIGHT); + } + size_t n0 = MIN(8 - start_row, n_rows); if (!flip_y) { - for(size_t v = 0; v < 8; v++, dst++) { - if ((y + v) >= SCREEN_HEIGHT) break; - u8 ch1 = sprite[v + 0]; -#if DEC_BIG_LUT - u32 color = lut[ch1]; -#else - u32 color = decode_1bpp(ch1, flip_x); -#endif - u32 mask = (color * 0xF); - color *= clr; - dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); - dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); - if ((start_row + v) == 7) { - dirty_tiles[tile_y + 1] |= dirty; - dst += (32 - 1) * 8; - } + for(size_t v = 0; v < n0; v++, dst++) { + u32 mask = lut[*sprite]; + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = msk0 & color; + u32 clr1 = msk1 & color; + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; + sprite++; + } + dst += (32 - 1) * 8; + for(size_t v = n0; v < n_rows; v++, dst++) { + u32 mask = lut[*sprite]; + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = msk0 & color; + u32 clr1 = msk1 & color; + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; + sprite++; } } else { - for(size_t v = 0; v < 8; v++, dst++) { - if ((y + v) >= SCREEN_HEIGHT) break; - u8 ch1 = sprite[(7 - v) + 0]; -#if DEC_BIG_LUT - u32 color = lut[ch1]; -#else - u32 color = decode_1bpp(ch1, flip_x); -#endif - u32 mask = (color * 0xF); - color *= clr; - dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); - dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); - if ((start_row + v) == 7) { - dirty_tiles[tile_y + 1] |= dirty; - dst += (32 - 1) * 8; - } + sprite += 7; + for(size_t v = 0; v < n0; v++, dst++) { + u32 mask = lut[*sprite]; + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = msk0 & color; + u32 clr1 = msk1 & color; + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; + sprite--; + } + dst += (32 - 1) * 8; + for(size_t v = n0; v < n_rows; v++, dst++) { + u32 mask = lut[*sprite]; + u32 msk0 = mask << shift_left; + u32 msk1 = mask >> shift_right; + u32 clr0 = msk0 & color; + u32 clr1 = msk1 & color; + dst[0] = (dst[0] & ~msk0) | clr0; + dst[8] = (dst[8] & ~msk1) | clr1; + sprite--; } } dirty_tiles[tile_y] |= dirty; + if (start_row != 0) { + dirty_tiles[tile_y + 1] |= dirty; + } } // @@ -620,9 +609,11 @@ draw_icn(size_t x, size_t y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { // IWRAM_CODE +UNROLL_LOOPS void flip_buffer(void) { -// Mode 0: double buffering without dirty tiles. +// Mode 0: double buffering without dirty tiles. Use this when we are clearing +// the screen every single frame. #if FLIP_TYPE == 0 if (backbuf == BUF_0) { backbuf = BUF_1; @@ -635,7 +626,7 @@ flip_buffer(void) { } // Mode 1: single buffer, copy the dirty lines from backbuffer (BUF_1) to -// frontbuffer (BUF_0) using the DMA. +// frontbuffer (BUF_0). #elif FLIP_TYPE == 1 u32 *front = BUF_0; u32 *back = BUF_1; @@ -646,7 +637,11 @@ flip_buffer(void) { continue; } u32 offset = j * 32 * 8; +#if NO_DMA == 0 dma_copy(front + offset, back + offset, (30 * 8 * 4), 3); +#else + copy32(front + offset, back + offset, (30 * 8)); +#endif dirty_tiles[j] = 0; } @@ -673,7 +668,7 @@ flip_buffer(void) { } // Mode 3: Double buffering with dirty line, copying the dirty lines if needed -// after flipping buffers with the DMA. +// after flipping buffers. #elif FLIP_TYPE == 3 bool should_flip = false; for (size_t j = 0; j < 20; ++j) { @@ -701,7 +696,11 @@ flip_buffer(void) { continue; } u32 offset = j * 32 * 8; +#if NO_DMA == 0 dma_copy(backbuf + offset, frontbuf + offset, (30 * 8 * 4), 3); +#else + copy32(backbuf + offset, frontbuf + offset, (30 * 8)); +#endif dirty_tiles[j] = 0; } @@ -746,6 +745,21 @@ flip_buffer(void) { #endif } +IWRAM_CODE +UNROLL_LOOPS +void +decode_1bpp(u32 *dst, u8 *src, u8 clr, u8 flip_x, u32 n_tiles) { + u32 color = 0x11111111 * clr; + if (!flip_x) { + for (size_t i = 0; i < n_tiles * 8; i++) { + *dst++ = lut_1bpp_mask[*src++] & color; + } + } else { + for (size_t i = 0; i < n_tiles * 8; i++) { + *dst++ = lut_1bpp_mask_flip_x[*src++] & color; + } + } +} // // Text rendering. // @@ -846,7 +860,11 @@ renderer_init(void) { DISP_CTRL = DISP_MODE_0 | DISP_BG_0 | DISP_BG_1; // Clear VRAM. +#if NO_DMA == 0 dma_fill((u32*)MEM_VRAM, 0, KB(96), 3); +#else + set32((u32*)MEM_VRAM, 0, KB(96)/4); +#endif // Initialize backgrounds. BG_CTRL(0) = BG_CHARBLOCK(CB_0) | BG_SCREENBLOCK(SB_0) | BG_PRIORITY(0) | BG_SIZE(1); -- cgit v1.2.1