From f3f221524e6be30217838661b4750820a7bebecf Mon Sep 17 00:00:00 2001 From: Bad Diode Date: Fri, 4 Jun 2021 13:38:46 +0200 Subject: Add initial performance optimization for rect draw --- src/main.c | 14 ++++++++---- src/renderer.c | 72 ++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/src/main.c b/src/main.c index ab41915..8df466f 100644 --- a/src/main.c +++ b/src/main.c @@ -50,6 +50,15 @@ WITH REGARD TO THIS SOFTWARE. #define PROF_INIT() #endif +void +test_rects() { + for (size_t i = 0; i < 100; i++) { + draw_rect(30, 30, 45, 45, 1); + draw_rect(35, 35, 60, 40, 2); + draw_rect(1, 1, 6, 6, 3); + } +} + int main(void) { // Adjust system wait times. SYSTEM_WAIT = SYSTEM_WAIT_CARTRIDGE; @@ -64,14 +73,11 @@ int main(void) { irq_init(); irs_set(IRQ_VBLANK, irs_stub); - // Main loop. PROF_INIT(); while (true) { bios_vblank_wait(); - txt_drawf("Hello world: %d", 4, 4, 6, 10); - draw_rect(30, 30, 45, 45, 1); - draw_rect(35, 35, 60, 40, 2); + PROF(test_rects(), eval_cycles); PROF_SHOW(); PROF(flip_buffer(), flip_cycles); } diff --git a/src/renderer.c b/src/renderer.c index 4aa583d..51647cb 100644 --- a/src/renderer.c +++ b/src/renderer.c @@ -47,24 +47,62 @@ draw_pixel(u16 x, u16 y, u8 color) { IWRAM_CODE void draw_rect(int x0, int y0, int x1, int y1, u8 clr) { - if (x0 > x1) { - int tmp = x0; - x0 = x1; - x1 = tmp; - } - if (y0 > y1) { - int tmp = y0; - y0 = y1; - y1 = tmp; - } - int dx = x1 - x0; - int dy = y1 - y0; - // TODO: SLOW should be vectorized. - for (int i = 0; i <= dx; ++i) { - draw_pixel(x0 + i, y0, clr); - draw_pixel(x0 + i, y1, clr); + BOUNDCHECK_SCREEN(x0, y0); + BOUNDCHECK_SCREEN(x1, y1); + + // Find row positions for the given x/y coordinates. 
+ size_t tile_x0 = x0 / 8; + size_t tile_y0 = y0 / 8; + size_t tile_x1 = x1 / 8; + size_t tile_y1 = y1 / 8; + size_t start_col0 = x0 % 8; + size_t start_col1 = x1 % 8; + size_t start_row0 = y0 % 8; + size_t start_row1 = y1 % 8; + + // Get pointers into the backbuffer for the tile rows that hold the y0 and y1 horizontal lines. + u32 *backbuffer0 = &BACKBUF[start_row0 + (tile_x0 + tile_y0 * 32) * 8]; + u32 *backbuffer1 = &BACKBUF[start_row1 + (tile_x0 + tile_y1 * 32) * 8]; + + u16 dx = tile_x1 - tile_x0; + u16 dy = y1 - y0; + + // There are 3 cases: + // 1. Lines fit on a single tile. + // 2. Lines go through 2 tiles, both require partial row updates. + // 3. Lines go through 3 or more tiles, first and last tiles use partial + // row updates, rows in the middle can write the whole 32-bit row without masking. + if (dx < 1) { + u32 row_mask = 0xFFFFFFFF; + row_mask >>= (7 - start_col1 - dx) * 4; + row_mask &= 0xFFFFFFFF << start_col0 * 4; + u32 row = (0x11111111 * clr) & row_mask; + backbuffer0[0] = (backbuffer0[0] & ~row_mask) | row; + backbuffer1[0] = (backbuffer1[0] & ~row_mask) | row; + dirty_tiles[tile_y0] |= 1 << tile_x0; + dirty_tiles[tile_y1] |= 1 << tile_x0; + } else { + size_t shift_left = start_col0 * 4; + size_t shift_right = (7 - start_col1) * 4; + u32 row_mask = 0xFFFFFFFF; + u32 row = 0x11111111 * clr; + backbuffer0[0] = (backbuffer0[0] & ~(row_mask << shift_left)) | (row << shift_left); + backbuffer1[0] = (backbuffer1[0] & ~(row_mask << shift_left)) | (row << shift_left); + dirty_tiles[tile_y0] |= 1 << tile_x0; + dirty_tiles[tile_y1] |= 1 << tile_x0; + for (size_t i = 1; i < dx; i++) { + backbuffer0[i * 8] = row; + backbuffer1[i * 8] = row; + dirty_tiles[tile_y0] |= 1 << tile_x0 + i; + dirty_tiles[tile_y1] |= 1 << tile_x0 + i; + } + backbuffer0[dx * 8] = (backbuffer0[dx * 8] & ~(row_mask >> shift_right)) | (row >> shift_right); + backbuffer1[dx * 8] = (backbuffer1[dx * 8] & ~(row_mask >> shift_right)) | (row >> shift_right); + dirty_tiles[tile_y0] |= 1 << tile_x0 + dx; + dirty_tiles[tile_y1] |= 1 << tile_x0 + dx; } - for (int i 
= 0; i <= dy; ++i) { + // The vertical line cases are analogous to the horizontal cases. + for (int i = 1; i < dy; ++i) { draw_pixel(x0, y0 + i, clr); draw_pixel(x1, y0 + i, clr); } -- cgit v1.2.1