diff options
author | Bad Diode <bd@badd10de.dev> | 2023-04-19 10:05:03 +0200 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2023-04-19 15:28:08 +0200 |
commit | 5f47e14f6ab4e3b346de1d62c65452e674edbebe (patch) | |
tree | 7b4ff41a4667c61b3a2a40c764aaf222f267b909 | |
parent | 1b1765d3a27b97dc282b6295aa206d92c32f33d9 (diff) | |
download | uxngba-5f47e14f6ab4e3b346de1d62c65452e674edbebe.tar.gz uxngba-5f47e14f6ab4e3b346de1d62c65452e674edbebe.zip |
Add optimized 1bpp drawing function
-rw-r--r-- | src/common.h | 6 | ||||
-rw-r--r-- | src/main.c | 41 | ||||
-rw-r--r-- | src/ppu.c | 546 |
3 files changed, 466 insertions, 127 deletions
diff --git a/src/common.h b/src/common.h index 0d6c89f..e62c78c 100644 --- a/src/common.h +++ b/src/common.h | |||
@@ -745,4 +745,10 @@ wait_vsync(void) { | |||
745 | #define EWRAM_CODE __attribute__((section(".ewram"), long_call)) | 745 | #define EWRAM_CODE __attribute__((section(".ewram"), long_call)) |
746 | #define EWRAM_BSS __attribute__((section(".sbss"))) | 746 | #define EWRAM_BSS __attribute__((section(".sbss"))) |
747 | 747 | ||
748 | // | ||
749 | // Compiler hints. | ||
750 | // | ||
751 | |||
752 | #define UNROLL_LOOPS __attribute__((optimize("unroll-loops"))) | ||
753 | |||
748 | #endif // GBAEXP_COMMON_H | 754 | #endif // GBAEXP_COMMON_H |
@@ -37,6 +37,8 @@ | |||
37 | #define CONTROL_METHODS CONTROL_CONTROLLER,CONTROL_MOUSE,CONTROL_KEYBOARD | 37 | #define CONTROL_METHODS CONTROL_CONTROLLER,CONTROL_MOUSE,CONTROL_KEYBOARD |
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | #define PROF_ENABLE 1 | ||
41 | |||
40 | #ifdef PROF_ENABLE | 42 | #ifdef PROF_ENABLE |
41 | #if PROF_ENABLE == 0 | 43 | #if PROF_ENABLE == 0 |
42 | #define TEXT_ENABLE 1 | 44 | #define TEXT_ENABLE 1 |
@@ -51,22 +53,38 @@ | |||
51 | #ifndef PROF_SHOW_Y | 53 | #ifndef PROF_SHOW_Y |
52 | #define PROF_SHOW_Y 0 | 54 | #define PROF_SHOW_Y 0 |
53 | #endif | 55 | #endif |
56 | // #define PROF_SHOW() \ | ||
57 | // do { \ | ||
58 | // txt_position((PROF_SHOW_X), (PROF_SHOW_Y));\ | ||
59 | // txt_printf("INPUT: %lu ", input_cycles);\ | ||
60 | // txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+1);\ | ||
61 | // txt_printf("EVAL: %lu ", eval_cycles);\ | ||
62 | // txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+2);\ | ||
63 | // txt_printf("FLIP: %lu ", flip_cycles);\ | ||
64 | // txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+3);\ | ||
65 | // txt_printf("MIX: %lu ", mix_cycles);\ | ||
66 | // } while (0) | ||
54 | #define PROF_SHOW() \ | 67 | #define PROF_SHOW() \ |
55 | do { \ | 68 | do { \ |
56 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y));\ | 69 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y));\ |
57 | txt_printf("INPUT: %lu ", input_cycles);\ | 70 | txt_printf("PIX: %lu ", ppu_pixel_cycles);\ |
58 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+1);\ | 71 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+1);\ |
59 | txt_printf("EVAL: %lu ", eval_cycles);\ | 72 | txt_printf("2BPP: %lu ", ppu_chr_cycles);\ |
60 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+2);\ | 73 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+2);\ |
61 | txt_printf("FLIP: %lu ", flip_cycles);\ | 74 | txt_printf("1BPP: %lu ", ppu_icn_cycles);\ |
62 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+3);\ | 75 | txt_position((PROF_SHOW_X), (PROF_SHOW_Y)+3);\ |
63 | txt_printf("MIX: %lu ", mix_cycles);\ | 76 | txt_printf("FLIP: %lu ", flip_cycles);\ |
64 | } while (0) | 77 | } while (0) |
65 | #define PROF_INIT() \ | 78 | #define PROF_INIT() \ |
66 | u32 flip_cycles = 0;\ | 79 | static u32 ppu_pixel_cycles = 0;\ |
67 | u32 eval_cycles = 0;\ | 80 | static u32 ppu_chr_cycles = 0;\ |
68 | u32 input_cycles = 0;\ | 81 | static u32 ppu_icn_cycles = 0;\ |
69 | u32 mix_cycles = 0; | 82 | static u32 flip_cycles = 0;\ |
83 | static u32 eval_cycles = 0;\ | ||
84 | static u32 input_cycles = 0;\ | ||
85 | static u32 mix_cycles = 0; | ||
86 | |||
87 | PROF_INIT(); | ||
70 | #else | 88 | #else |
71 | #define PROF(F,VAR) (F) | 89 | #define PROF(F,VAR) (F) |
72 | #define PROF_SHOW() | 90 | #define PROF_SHOW() |
@@ -131,7 +149,7 @@ screen_deo(u8 *ram, u8 *d, u8 port) { | |||
131 | u8 layer = d[0xe] & 0x40; | 149 | u8 layer = d[0xe] & 0x40; |
132 | PEKDEV(x, 0x8); | 150 | PEKDEV(x, 0x8); |
133 | PEKDEV(y, 0xa); | 151 | PEKDEV(y, 0xa); |
134 | ppu_pixel(layer ? ppu.fg : ppu.bg, x, y, d[0xe] & 0x3); | 152 | PROF(ppu_pixel(layer ? ppu.fg : ppu.bg, x, y, d[0xe] & 0x3), ppu_pixel_cycles); |
135 | if(d[0x6] & 0x01) POKDEV(0x8, x + 1); /* auto x+1 */ | 153 | if(d[0x6] & 0x01) POKDEV(0x8, x + 1); /* auto x+1 */ |
136 | if(d[0x6] & 0x02) POKDEV(0xa, y + 1); /* auto y+1 */ | 154 | if(d[0x6] & 0x02) POKDEV(0xa, y + 1); /* auto y+1 */ |
137 | break; | 155 | break; |
@@ -155,9 +173,9 @@ screen_deo(u8 *ram, u8 *d, u8 port) { | |||
155 | for(size_t i = 0; i <= n; i++) { | 173 | for(size_t i = 0; i <= n; i++) { |
156 | u8 *sprite = &ram[addr]; | 174 | u8 *sprite = &ram[addr]; |
157 | if (twobpp) { | 175 | if (twobpp) { |
158 | ppu_2bpp(layer, x + dy * i, y + dx * i, sprite, color, flipx, flipy); | 176 | PROF(ppu_2bpp(layer, x + dy * i, y + dx * i, sprite, color, flipx, flipy), ppu_chr_cycles); |
159 | } else { | 177 | } else { |
160 | ppu_1bpp(layer, x + dy * i, y + dx * i, sprite, color, flipx, flipy); | 178 | PROF(ppu_1bpp(layer, x + dy * i, y + dx * i, sprite, color, flipx, flipy), ppu_icn_cycles); |
161 | } | 179 | } |
162 | addr += (d[0x6] & 0x04) << (1 + twobpp); | 180 | addr += (d[0x6] & 0x04) << (1 + twobpp); |
163 | } | 181 | } |
@@ -564,7 +582,6 @@ main(void) { | |||
564 | 582 | ||
565 | // Main loop. | 583 | // Main loop. |
566 | uxn_eval(&u, PAGE_PROGRAM); | 584 | uxn_eval(&u, PAGE_PROGRAM); |
567 | PROF_INIT(); | ||
568 | u8 frame_counter = 0; | 585 | u8 frame_counter = 0; |
569 | while(true) { | 586 | while(true) { |
570 | bios_vblank_wait(); | 587 | bios_vblank_wait(); |
@@ -15,6 +15,8 @@ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |||
15 | WITH REGARD TO THIS SOFTWARE. | 15 | WITH REGARD TO THIS SOFTWARE. |
16 | */ | 16 | */ |
17 | 17 | ||
18 | #define NEW_PPU 1 | ||
19 | |||
18 | #define FG_FRONT ((u32*)(MEM_VRAM)) | 20 | #define FG_FRONT ((u32*)(MEM_VRAM)) |
19 | #define BG_FRONT ((u32*)(MEM_VRAM + KB(20))) | 21 | #define BG_FRONT ((u32*)(MEM_VRAM + KB(20))) |
20 | #define FG_BACK ((u32*)(MEM_VRAM + KB(44))) | 22 | #define FG_BACK ((u32*)(MEM_VRAM + KB(44))) |
@@ -22,6 +24,18 @@ WITH REGARD TO THIS SOFTWARE. | |||
22 | #define TILE_MAP ((u32*)(MEM_VRAM + KB(40))) | 24 | #define TILE_MAP ((u32*)(MEM_VRAM + KB(40))) |
23 | #define FONT_DATA ((u32*)(MEM_VRAM + KB(84))) | 25 | #define FONT_DATA ((u32*)(MEM_VRAM + KB(84))) |
24 | 26 | ||
27 | #ifdef DISABLE_BOUNDCHECK_SCREEN | ||
28 | #define BOUNDCHECK_SCREEN(X,Y) | ||
29 | #else | ||
30 | #define BOUNDCHECK_SCREEN(X,Y) if ((X) >= SCREEN_WIDTH || (Y) >= SCREEN_HEIGHT) return; | ||
31 | #endif | ||
32 | |||
33 | // Swap A and B values without a tmp variable. | ||
34 | #define SWAP(A, B) (((A) ^= (B)), ((B) ^= (A)), ((A) ^= (B))) | ||
35 | |||
36 | // Swap A and B values to make sure A <= B. | ||
37 | #define MAYBE_SWAP(A,B) if ((A) > (B)) { SWAP(A,B); } | ||
38 | |||
25 | // Keyboard. | 39 | // Keyboard. |
26 | #define SPRITE_START_IDX 640 | 40 | #define SPRITE_START_IDX 640 |
27 | 41 | ||
@@ -135,6 +149,116 @@ static u32 lut2bpp_flipx[256] = { | |||
135 | 0x11111111 | 149 | 0x11111111 |
136 | }; | 150 | }; |
137 | 151 | ||
152 | static u32 dec_byte_flip_x[256] = { | ||
153 | 0x00000000, 0x00000001, 0x00000010, 0x00000011, 0x00000100, | ||
154 | 0x00000101, 0x00000110, 0x00000111, 0x00001000, 0x00001001, | ||
155 | 0x00001010, 0x00001011, 0x00001100, 0x00001101, 0x00001110, | ||
156 | 0x00001111, 0x00010000, 0x00010001, 0x00010010, 0x00010011, | ||
157 | 0x00010100, 0x00010101, 0x00010110, 0x00010111, 0x00011000, | ||
158 | 0x00011001, 0x00011010, 0x00011011, 0x00011100, 0x00011101, | ||
159 | 0x00011110, 0x00011111, 0x00100000, 0x00100001, 0x00100010, | ||
160 | 0x00100011, 0x00100100, 0x00100101, 0x00100110, 0x00100111, | ||
161 | 0x00101000, 0x00101001, 0x00101010, 0x00101011, 0x00101100, | ||
162 | 0x00101101, 0x00101110, 0x00101111, 0x00110000, 0x00110001, | ||
163 | 0x00110010, 0x00110011, 0x00110100, 0x00110101, 0x00110110, | ||
164 | 0x00110111, 0x00111000, 0x00111001, 0x00111010, 0x00111011, | ||
165 | 0x00111100, 0x00111101, 0x00111110, 0x00111111, 0x01000000, | ||
166 | 0x01000001, 0x01000010, 0x01000011, 0x01000100, 0x01000101, | ||
167 | 0x01000110, 0x01000111, 0x01001000, 0x01001001, 0x01001010, | ||
168 | 0x01001011, 0x01001100, 0x01001101, 0x01001110, 0x01001111, | ||
169 | 0x01010000, 0x01010001, 0x01010010, 0x01010011, 0x01010100, | ||
170 | 0x01010101, 0x01010110, 0x01010111, 0x01011000, 0x01011001, | ||
171 | 0x01011010, 0x01011011, 0x01011100, 0x01011101, 0x01011110, | ||
172 | 0x01011111, 0x01100000, 0x01100001, 0x01100010, 0x01100011, | ||
173 | 0x01100100, 0x01100101, 0x01100110, 0x01100111, 0x01101000, | ||
174 | 0x01101001, 0x01101010, 0x01101011, 0x01101100, 0x01101101, | ||
175 | 0x01101110, 0x01101111, 0x01110000, 0x01110001, 0x01110010, | ||
176 | 0x01110011, 0x01110100, 0x01110101, 0x01110110, 0x01110111, | ||
177 | 0x01111000, 0x01111001, 0x01111010, 0x01111011, 0x01111100, | ||
178 | 0x01111101, 0x01111110, 0x01111111, 0x10000000, 0x10000001, | ||
179 | 0x10000010, 0x10000011, 0x10000100, 0x10000101, 0x10000110, | ||
180 | 0x10000111, 0x10001000, 0x10001001, 0x10001010, 0x10001011, | ||
181 | 0x10001100, 0x10001101, 0x10001110, 0x10001111, 0x10010000, | ||
182 | 0x10010001, 0x10010010, 0x10010011, 0x10010100, 0x10010101, | ||
183 | 0x10010110, 0x10010111, 0x10011000, 0x10011001, 0x10011010, | ||
184 | 0x10011011, 0x10011100, 0x10011101, 0x10011110, 0x10011111, | ||
185 | 0x10100000, 0x10100001, 0x10100010, 0x10100011, 0x10100100, | ||
186 | 0x10100101, 0x10100110, 0x10100111, 0x10101000, 0x10101001, | ||
187 | 0x10101010, 0x10101011, 0x10101100, 0x10101101, 0x10101110, | ||
188 | 0x10101111, 0x10110000, 0x10110001, 0x10110010, 0x10110011, | ||
189 | 0x10110100, 0x10110101, 0x10110110, 0x10110111, 0x10111000, | ||
190 | 0x10111001, 0x10111010, 0x10111011, 0x10111100, 0x10111101, | ||
191 | 0x10111110, 0x10111111, 0x11000000, 0x11000001, 0x11000010, | ||
192 | 0x11000011, 0x11000100, 0x11000101, 0x11000110, 0x11000111, | ||
193 | 0x11001000, 0x11001001, 0x11001010, 0x11001011, 0x11001100, | ||
194 | 0x11001101, 0x11001110, 0x11001111, 0x11010000, 0x11010001, | ||
195 | 0x11010010, 0x11010011, 0x11010100, 0x11010101, 0x11010110, | ||
196 | 0x11010111, 0x11011000, 0x11011001, 0x11011010, 0x11011011, | ||
197 | 0x11011100, 0x11011101, 0x11011110, 0x11011111, 0x11100000, | ||
198 | 0x11100001, 0x11100010, 0x11100011, 0x11100100, 0x11100101, | ||
199 | 0x11100110, 0x11100111, 0x11101000, 0x11101001, 0x11101010, | ||
200 | 0x11101011, 0x11101100, 0x11101101, 0x11101110, 0x11101111, | ||
201 | 0x11110000, 0x11110001, 0x11110010, 0x11110011, 0x11110100, | ||
202 | 0x11110101, 0x11110110, 0x11110111, 0x11111000, 0x11111001, | ||
203 | 0x11111010, 0x11111011, 0x11111100, 0x11111101, 0x11111110, | ||
204 | 0x11111111 | ||
205 | }; | ||
206 | |||
207 | static u32 dec_byte[256] = { | ||
208 | 0x00000000, 0x10000000, 0x01000000, 0x11000000, 0x00100000, | ||
209 | 0x10100000, 0x01100000, 0x11100000, 0x00010000, 0x10010000, | ||
210 | 0x01010000, 0x11010000, 0x00110000, 0x10110000, 0x01110000, | ||
211 | 0x11110000, 0x00001000, 0x10001000, 0x01001000, 0x11001000, | ||
212 | 0x00101000, 0x10101000, 0x01101000, 0x11101000, 0x00011000, | ||
213 | 0x10011000, 0x01011000, 0x11011000, 0x00111000, 0x10111000, | ||
214 | 0x01111000, 0x11111000, 0x00000100, 0x10000100, 0x01000100, | ||
215 | 0x11000100, 0x00100100, 0x10100100, 0x01100100, 0x11100100, | ||
216 | 0x00010100, 0x10010100, 0x01010100, 0x11010100, 0x00110100, | ||
217 | 0x10110100, 0x01110100, 0x11110100, 0x00001100, 0x10001100, | ||
218 | 0x01001100, 0x11001100, 0x00101100, 0x10101100, 0x01101100, | ||
219 | 0x11101100, 0x00011100, 0x10011100, 0x01011100, 0x11011100, | ||
220 | 0x00111100, 0x10111100, 0x01111100, 0x11111100, 0x00000010, | ||
221 | 0x10000010, 0x01000010, 0x11000010, 0x00100010, 0x10100010, | ||
222 | 0x01100010, 0x11100010, 0x00010010, 0x10010010, 0x01010010, | ||
223 | 0x11010010, 0x00110010, 0x10110010, 0x01110010, 0x11110010, | ||
224 | 0x00001010, 0x10001010, 0x01001010, 0x11001010, 0x00101010, | ||
225 | 0x10101010, 0x01101010, 0x11101010, 0x00011010, 0x10011010, | ||
226 | 0x01011010, 0x11011010, 0x00111010, 0x10111010, 0x01111010, | ||
227 | 0x11111010, 0x00000110, 0x10000110, 0x01000110, 0x11000110, | ||
228 | 0x00100110, 0x10100110, 0x01100110, 0x11100110, 0x00010110, | ||
229 | 0x10010110, 0x01010110, 0x11010110, 0x00110110, 0x10110110, | ||
230 | 0x01110110, 0x11110110, 0x00001110, 0x10001110, 0x01001110, | ||
231 | 0x11001110, 0x00101110, 0x10101110, 0x01101110, 0x11101110, | ||
232 | 0x00011110, 0x10011110, 0x01011110, 0x11011110, 0x00111110, | ||
233 | 0x10111110, 0x01111110, 0x11111110, 0x00000001, 0x10000001, | ||
234 | 0x01000001, 0x11000001, 0x00100001, 0x10100001, 0x01100001, | ||
235 | 0x11100001, 0x00010001, 0x10010001, 0x01010001, 0x11010001, | ||
236 | 0x00110001, 0x10110001, 0x01110001, 0x11110001, 0x00001001, | ||
237 | 0x10001001, 0x01001001, 0x11001001, 0x00101001, 0x10101001, | ||
238 | 0x01101001, 0x11101001, 0x00011001, 0x10011001, 0x01011001, | ||
239 | 0x11011001, 0x00111001, 0x10111001, 0x01111001, 0x11111001, | ||
240 | 0x00000101, 0x10000101, 0x01000101, 0x11000101, 0x00100101, | ||
241 | 0x10100101, 0x01100101, 0x11100101, 0x00010101, 0x10010101, | ||
242 | 0x01010101, 0x11010101, 0x00110101, 0x10110101, 0x01110101, | ||
243 | 0x11110101, 0x00001101, 0x10001101, 0x01001101, 0x11001101, | ||
244 | 0x00101101, 0x10101101, 0x01101101, 0x11101101, 0x00011101, | ||
245 | 0x10011101, 0x01011101, 0x11011101, 0x00111101, 0x10111101, | ||
246 | 0x01111101, 0x11111101, 0x00000011, 0x10000011, 0x01000011, | ||
247 | 0x11000011, 0x00100011, 0x10100011, 0x01100011, 0x11100011, | ||
248 | 0x00010011, 0x10010011, 0x01010011, 0x11010011, 0x00110011, | ||
249 | 0x10110011, 0x01110011, 0x11110011, 0x00001011, 0x10001011, | ||
250 | 0x01001011, 0x11001011, 0x00101011, 0x10101011, 0x01101011, | ||
251 | 0x11101011, 0x00011011, 0x10011011, 0x01011011, 0x11011011, | ||
252 | 0x00111011, 0x10111011, 0x01111011, 0x11111011, 0x00000111, | ||
253 | 0x10000111, 0x01000111, 0x11000111, 0x00100111, 0x10100111, | ||
254 | 0x01100111, 0x11100111, 0x00010111, 0x10010111, 0x01010111, | ||
255 | 0x11010111, 0x00110111, 0x10110111, 0x01110111, 0x11110111, | ||
256 | 0x00001111, 0x10001111, 0x01001111, 0x11001111, 0x00101111, | ||
257 | 0x10101111, 0x01101111, 0x11101111, 0x00011111, 0x10011111, | ||
258 | 0x01011111, 0x11011111, 0x00111111, 0x10111111, 0x01111111, | ||
259 | 0x11111111 | ||
260 | }; | ||
261 | |||
138 | static u8 blending[5][16] = { | 262 | static u8 blending[5][16] = { |
139 | {0, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 2, 3, 3, 3, 0}, | 263 | {0, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 2, 3, 3, 3, 0}, |
140 | {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, | 264 | {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}, |
@@ -181,6 +305,141 @@ ppu_pixel(u32 *layer, u16 x, u16 y, u8 color) { | |||
181 | } | 305 | } |
182 | 306 | ||
183 | IWRAM_CODE | 307 | IWRAM_CODE |
308 | static inline | ||
309 | u32 | ||
310 | decode_1bpp(u8 row, u8 flip_x) { | ||
311 | return flip_x ? dec_byte_flip_x[row] : dec_byte[row]; | ||
312 | } | ||
313 | |||
314 | IWRAM_CODE | ||
315 | static inline | ||
316 | void | ||
317 | draw_1bpp_row(u32 *layer, size_t x, size_t y, u8 sprite, u8 clr, u8 flip_x) { | ||
318 | BOUNDCHECK_SCREEN(x, y); | ||
319 | |||
320 | size_t tile_x = x / 8; | ||
321 | size_t tile_y = y / 8; | ||
322 | size_t start_col = x % 8; | ||
323 | size_t start_row = y % 8; | ||
324 | size_t shift_left = start_col * 4; | ||
325 | size_t shift_right = (8 - start_col) * 4; | ||
326 | |||
327 | u32 *dst = &layer[start_row + (tile_x + tile_y * 32) * 8]; | ||
328 | u32 color = decode_1bpp(sprite, flip_x); | ||
329 | u32 mask = ~color; | ||
330 | color *= clr; | ||
331 | if (start_col == 0) { | ||
332 | dst[0] = (dst[0] & ~mask) | color; | ||
333 | } else { | ||
334 | dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); | ||
335 | dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); | ||
336 | } | ||
337 | |||
338 | // TODO: different blend modes? | ||
339 | } | ||
340 | |||
341 | IWRAM_CODE | ||
342 | void | ||
343 | draw_icn(u32 * layer, size_t x, size_t y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { | ||
344 | BOUNDCHECK_SCREEN(x, y); | ||
345 | if (!flip_y) { | ||
346 | for(size_t v = 0; v < 8; v++) { | ||
347 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
348 | u8 ch1 = sprite[v]; | ||
349 | draw_1bpp_row(layer, x, y + v, ch1, clr, flip_x); | ||
350 | } | ||
351 | } else { | ||
352 | for(size_t v = 0; v < 8; v++) { | ||
353 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
354 | u8 ch1 = sprite[(7 - v)]; | ||
355 | draw_1bpp_row(layer, x, y + v, ch1, clr, flip_x); | ||
356 | } | ||
357 | } | ||
358 | } | ||
359 | |||
360 | #if NEW_PPU == 1 | ||
361 | IWRAM_CODE | ||
362 | UNROLL_LOOPS | ||
363 | void | ||
364 | ppu_1bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 clr, u8 flip_x, u8 flip_y) { | ||
365 | BOUNDCHECK_SCREEN(x, y); | ||
366 | |||
367 | size_t tile_x = x / 8; | ||
368 | size_t tile_y = y / 8; | ||
369 | size_t start_col = x % 8; | ||
370 | size_t start_row = y % 8; | ||
371 | size_t shift_left = start_col * 4; | ||
372 | size_t shift_right = (8 - start_col) * 4; | ||
373 | u32 *dst = &layer[start_row + (tile_x + tile_y * 32) * 8]; | ||
374 | if (blending[4][clr]) { | ||
375 | u64 mask = ~((u64)0xFFFFFFFF); | ||
376 | if (!flip_y) { | ||
377 | for(size_t v = 0; v < 8; v++, dst++) { | ||
378 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
379 | u8 ch1 = sprite[v]; | ||
380 | u32 color_1 = decode_1bpp(ch1, flip_x); | ||
381 | u32 color_2 = (color_1 ^ 0xffffffff) & 0x11111111; | ||
382 | u32 color = (color_1 * (clr & 3)) | (color_2 * (clr >> 2)); | ||
383 | if (start_col == 0) { | ||
384 | dst[0] = (dst[0] & mask) | color; | ||
385 | } else { | ||
386 | dst[0] = (dst[0] & (mask << shift_left)) | color; | ||
387 | dst[8] = (dst[8] & (mask >> shift_right)) | (color >> shift_right); | ||
388 | } | ||
389 | } | ||
390 | } else { | ||
391 | for(size_t v = 0; v < 8; v++, dst++) { | ||
392 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
393 | u8 ch1 = sprite[(7 - v)]; | ||
394 | u32 color_1 = decode_1bpp(ch1, flip_x); | ||
395 | u32 color_2 = (color_1 ^ 0xffffffff) & 0x11111111; | ||
396 | u32 color = (color_1 * (clr & 3)) | (color_2 * (clr >> 2)); | ||
397 | if (start_col == 0) { | ||
398 | dst[0] = (dst[0] & mask) | color; | ||
399 | } else { | ||
400 | dst[0] = (dst[0] & (mask << shift_left)) | color; | ||
401 | dst[8] = (dst[8] & (mask >> shift_right)) | (color >> shift_right); | ||
402 | } | ||
403 | } | ||
404 | } | ||
405 | } else { | ||
406 | if (!flip_y) { | ||
407 | for(size_t v = 0; v < 8; v++, dst++) { | ||
408 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
409 | u8 ch1 = sprite[v]; | ||
410 | u32 color = decode_1bpp(ch1, flip_x); | ||
411 | u32 mask = ~color; | ||
412 | color *= clr; | ||
413 | if (start_col == 0) { | ||
414 | dst[0] = (dst[0] & ~mask) | color; | ||
415 | } else { | ||
416 | dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); | ||
417 | dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); | ||
418 | } | ||
419 | } | ||
420 | } else { | ||
421 | for(size_t v = 0; v < 8; v++, dst++) { | ||
422 | if ((y + v) >= SCREEN_HEIGHT) break; | ||
423 | u8 ch1 = sprite[(7 - v)]; | ||
424 | u32 color = decode_1bpp(ch1, flip_x); | ||
425 | u32 mask = ~color; | ||
426 | color *= clr; | ||
427 | if (start_col == 0) { | ||
428 | dst[0] = (dst[0] & ~mask) | color; | ||
429 | } else { | ||
430 | dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); | ||
431 | dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); | ||
432 | } | ||
433 | } | ||
434 | } | ||
435 | } | ||
436 | |||
437 | // dirty_tiles[y >> 3] |= dirtyflag; | ||
438 | // dirty_tiles[(y + 7) >> 3] |= dirtyflag; | ||
439 | } | ||
440 | |||
441 | #else | ||
442 | IWRAM_CODE | ||
184 | void | 443 | void |
185 | ppu_1bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, u8 flipx, u8 flipy) { | 444 | ppu_1bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, u8 flipx, u8 flipy) { |
186 | u8 sprline; | 445 | u8 sprline; |
@@ -194,7 +453,7 @@ ppu_1bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, u8 flipx, u8 flipy) { | |||
194 | 453 | ||
195 | if (flipy) flipy = 7; | 454 | if (flipy) flipy = 7; |
196 | 455 | ||
197 | if (x >= SCREEN_WIDTH || y >= SCREEN_HEIGHT) return; | 456 | BOUNDCHECK_SCREEN(x, y); |
198 | 457 | ||
199 | if (blending[4][color]) { | 458 | if (blending[4][color]) { |
200 | u64 mask = ~((u64)0xFFFFFFFF << shift); | 459 | u64 mask = ~((u64)0xFFFFFFFF << shift); |
@@ -229,126 +488,183 @@ ppu_1bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, u8 flipx, u8 flipy) { | |||
229 | dirty_tiles[y >> 3] |= dirtyflag; | 488 | dirty_tiles[y >> 3] |= dirtyflag; |
230 | dirty_tiles[(y + 7) >> 3] |= dirtyflag; | 489 | dirty_tiles[(y + 7) >> 3] |= dirtyflag; |
231 | } | 490 | } |
491 | #endif | ||
232 | 492 | ||
233 | IWRAM_CODE | 493 | IWRAM_CODE |
494 | static inline | ||
234 | void | 495 | void |
235 | ppu_2bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, | 496 | draw_2bpp_row(void *layer, size_t x, size_t y, u8 a, u8 b, u8 flip_x) { |
236 | u8 flipx, u8 flipy) { | 497 | // BOUNDCHECK_SCREEN(x, y); |
237 | u8 sprline1, sprline2; | ||
238 | u8 xrightedge = x < ((32 - 1) * 8); | ||
239 | u16 v, h; | ||
240 | u32 dirtyflag = (1 << (x >> 3)) | (1 << ((x + 7) >> 3)); | ||
241 | |||
242 | u32 layerpos = ((y & 7) + (((x >> 3) + (y >> 3) * 32) * 8)); | ||
243 | u32 *layerptr = &layer[layerpos]; | ||
244 | u32 shift = (x & 7) << 2; | ||
245 | |||
246 | if (flipy) flipy = 7; | ||
247 | |||
248 | if (x >= SCREEN_WIDTH || y >= SCREEN_HEIGHT) return; | ||
249 | |||
250 | if (color == 1) { | ||
251 | u32 *lut_expand = flipx ? lut_2bpp : lut2bpp_flipx; | ||
252 | u64 mask = ~((u64)0xFFFFFFFF << shift); | ||
253 | |||
254 | for (v = 0; v < 8; v++, layerptr++) { | ||
255 | if ((y + v) >= (24 * 8)) break; | ||
256 | |||
257 | sprline1 = sprite[v ^ flipy]; | ||
258 | sprline2 = sprite[(v ^ flipy) | 8]; | ||
259 | |||
260 | u32 data32 = (lut_expand[sprline1]) | (lut_expand[sprline2] << 1); | ||
261 | u64 data = ((u64) (data32 & 0x33333333)) << shift; | ||
262 | |||
263 | layerptr[0] = (layerptr[0] & mask) | data; | ||
264 | if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); | ||
265 | |||
266 | if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; | ||
267 | } | ||
268 | } else if (blending[4][color]) { | ||
269 | u64 mask = ~((u64)0xFFFFFFFF << shift); | ||
270 | |||
271 | for (v = 0; v < 8; v++, layerptr++) { | ||
272 | if ((y + v) >= (24 * 8)) break; | ||
273 | |||
274 | u8 ch1 = sprite[v ^ flipy]; | ||
275 | u8 ch2 = sprite[(v ^ flipy) | 8]; | ||
276 | u32 data32 = 0; | ||
277 | |||
278 | if (!flipx) { | ||
279 | for (h = 0; h < 8; h++) { | ||
280 | data32 <<= 4; | ||
281 | |||
282 | u8 ch = (ch1 & 1) | ((ch2 & 1) << 1); | ||
283 | data32 |= blending[ch][color]; | ||
284 | |||
285 | ch1 >>= 1; ch2 >>= 1; | ||
286 | } | ||
287 | } else { | ||
288 | for (h = 0; h < 8; h++) { | ||
289 | data32 <<= 4; | ||
290 | |||
291 | u8 ch = (ch1 >> 7) | ((ch2 >> 7) << 1); | ||
292 | data32 |= blending[ch][color]; | ||
293 | |||
294 | ch1 <<= 1; ch2 <<= 1; | ||
295 | } | ||
296 | } | ||
297 | |||
298 | u64 data = ((u64) (data32 & 0x33333333)) << shift; | ||
299 | |||
300 | layerptr[0] = (layerptr[0] & mask) | data; | ||
301 | if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); | ||
302 | 498 | ||
303 | if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; | 499 | size_t tile_x = x / 8; |
304 | } | 500 | size_t tile_y = y / 8; |
501 | size_t start_col = x % 8; | ||
502 | size_t start_row = y % 8; | ||
503 | size_t shift_left = start_col * 4; | ||
504 | size_t shift_right = (8 - start_col) * 4; | ||
505 | |||
506 | u32 *dst = &layer[start_row + (tile_x + tile_y * 32) * 8]; | ||
507 | // #if DEC_BIG_LUT | ||
508 | u32 *lut = dec_byte; | ||
509 | if (flip_x) { | ||
510 | lut = dec_byte_flip_x; | ||
511 | } | ||
512 | u32 clr_a = lut[a]; | ||
513 | u32 clr_b = lut[b]; | ||
514 | // #else | ||
515 | // u32 clr_a = decode_1bpp(a, flip_x); | ||
516 | // u32 clr_b = decode_1bpp(b, flip_x); | ||
517 | // #endif | ||
518 | u32 mask_a = (clr_a * 0xF); | ||
519 | u32 mask_b = (clr_b * 0xF); | ||
520 | u32 mask = (mask_a | mask_b); | ||
521 | u32 color = clr_a + (clr_b << 1); | ||
522 | if (start_col == 0) { | ||
523 | dst[0] = (dst[0] & ~mask) | color; | ||
305 | } else { | 524 | } else { |
306 | for (v = 0; v < 8; v++, layerptr++) { | 525 | dst[0] = (dst[0] & ~(mask << shift_left)) | (color << shift_left); |
307 | if ((y + v) >= (24 * 8)) break; | 526 | dst[8] = (dst[8] & ~(mask >> shift_right)) | (color >> shift_right); |
308 | 527 | } | |
309 | u8 ch1 = sprite[v ^ flipy]; | ||
310 | u8 ch2 = sprite[(v ^ flipy) | 8]; | ||
311 | u32 data32 = 0; | ||
312 | u32 mask32 = 0; | ||
313 | |||
314 | if (!flipx) { | ||
315 | for (h = 0; h < 8; h++) { | ||
316 | data32 <<= 4; mask32 <<= 4; | ||
317 | |||
318 | if ((ch1 | ch2) & 1) { | ||
319 | u8 ch = (ch1 & 1) | ((ch2 & 1) << 1); | ||
320 | data32 |= blending[ch][color]; | ||
321 | mask32 |= 0xF; | ||
322 | } | ||
323 | 528 | ||
324 | ch1 >>= 1; ch2 >>= 1; | 529 | // TODO: different blend modes? |
325 | } | 530 | } |
326 | } else { | ||
327 | for (h = 0; h < 8; h++) { | ||
328 | data32 <<= 4; mask32 <<= 4; | ||
329 | 531 | ||
330 | if ((ch1 | ch2) & 128) { | 532 | IWRAM_CODE |
331 | u8 ch = (ch1 >> 7) | ((ch2 >> 7) << 1); | 533 | void |
332 | data32 |= blending[ch][color]; | 534 | ppu_2bpp(u32 *layer, u16 x, u16 y, u8 *sprite, u8 color, |
333 | mask32 |= 0xF; | 535 | u8 flip_x, u8 flip_y) { |
334 | } | 536 | // u32 *dst = &layer[0]; |
537 | // *dst = 0x111111111; | ||
538 | // if (!flip_y) { | ||
539 | // for(size_t v = 0; v < 8; v++) { | ||
540 | // // if ((y + v) >= SCREEN_HEIGHT) break; | ||
541 | // u8 ch1 = sprite[v + 0]; | ||
542 | // u8 ch2 = sprite[v + 8]; | ||
543 | // draw_2bpp_row(layer, x, y + v, ch1, ch2, flip_x); | ||
544 | // } | ||
545 | // } else { | ||
546 | // for(size_t v = 0; v < 8; v++) { | ||
547 | // // if ((y + v) >= SCREEN_HEIGHT) break; | ||
548 | // u8 ch1 = sprite[(7 - v) + 0]; | ||
549 | // u8 ch2 = sprite[(7 - v) + 8]; | ||
550 | // draw_2bpp_row(layer, x, y + v, ch1, ch2, flip_x); | ||
551 | // } | ||
552 | // } | ||
553 | // u8 sprline1, sprline2; | ||
554 | // u8 xrightedge = x < ((32 - 1) * 8); | ||
555 | // u16 v, h; | ||
556 | // u32 dirtyflag = (1 << (x >> 3)) | (1 << ((x + 7) >> 3)); | ||
557 | |||
558 | // u32 layerpos = ((y & 7) + (((x >> 3) + (y >> 3) * 32) * 8)); | ||
559 | // u32 *layerptr = &layer[layerpos]; | ||
560 | // u32 shift = (x & 7) << 2; | ||
561 | |||
562 | // if (flip_y) flip_y = 7; | ||
563 | |||
564 | // if (x >= SCREEN_WIDTH || y >= SCREEN_HEIGHT) return; | ||
565 | |||
566 | // if (color == 1) { | ||
567 | // u32 *lut_expand = flip_x ? lut_2bpp : lut2bpp_flipx; | ||
568 | // u64 mask = ~((u64)0xFFFFFFFF << shift); | ||
569 | |||
570 | // for (v = 0; v < 8; v++, layerptr++) { | ||
571 | // if ((y + v) >= (24 * 8)) break; | ||
572 | |||
573 | // sprline1 = sprite[v ^ flip_y]; | ||
574 | // sprline2 = sprite[(v ^ flip_y) | 8]; | ||
575 | |||
576 | // u32 data32 = (lut_expand[sprline1]) | (lut_expand[sprline2] << 1); | ||
577 | // u64 data = ((u64) (data32 & 0x33333333)) << shift; | ||
578 | |||
579 | // layerptr[0] = (layerptr[0] & mask) | data; | ||
580 | // if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); | ||
581 | |||
582 | // if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; | ||
583 | // } | ||
584 | // } else if (blending[4][color]) { | ||
585 | // u64 mask = ~((u64)0xFFFFFFFF << shift); | ||
586 | |||
587 | // for (v = 0; v < 8; v++, layerptr++) { | ||
588 | // if ((y + v) >= (24 * 8)) break; | ||
589 | |||
590 | // u8 ch1 = sprite[v ^ flip_y]; | ||
591 | // u8 ch2 = sprite[(v ^ flip_y) | 8]; | ||
592 | // u32 data32 = 0; | ||
593 | |||
594 | // if (!flip_x) { | ||
595 | // for (h = 0; h < 8; h++) { | ||
596 | // data32 <<= 4; | ||
597 | |||
598 | // u8 ch = (ch1 & 1) | ((ch2 & 1) << 1); | ||
599 | // data32 |= blending[ch][color]; | ||
600 | |||
601 | // ch1 >>= 1; ch2 >>= 1; | ||
602 | // } | ||
603 | // } else { | ||
604 | // for (h = 0; h < 8; h++) { | ||
605 | // data32 <<= 4; | ||
606 | |||
607 | // u8 ch = (ch1 >> 7) | ((ch2 >> 7) << 1); | ||
608 | // data32 |= blending[ch][color]; | ||
609 | |||
610 | // ch1 <<= 1; ch2 <<= 1; | ||
611 | // } | ||
612 | // } | ||
613 | |||
614 | // u64 data = ((u64) (data32 & 0x33333333)) << shift; | ||
615 | |||
616 | // layerptr[0] = (layerptr[0] & mask) | data; | ||
617 | // if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); | ||
618 | |||
619 | // if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; | ||
620 | // } | ||
621 | // } else { | ||
622 | // for (v = 0; v < 8; v++, layerptr++) { | ||
623 | // if ((y + v) >= (24 * 8)) break; | ||
624 | |||
625 | // u8 ch1 = sprite[v ^ flip_y]; | ||
626 | // u8 ch2 = sprite[(v ^ flip_y) | 8]; | ||
627 | // u32 data32 = 0; | ||
628 | // u32 mask32 = 0; | ||
629 | |||
630 | // if (!flip_x) { | ||
631 | // for (h = 0; h < 8; h++) { | ||
632 | // data32 <<= 4; mask32 <<= 4; | ||
633 | |||
634 | // if ((ch1 | ch2) & 1) { | ||
635 | // u8 ch = (ch1 & 1) | ((ch2 & 1) << 1); | ||
636 | // data32 |= blending[ch][color]; | ||
637 | // mask32 |= 0xF; | ||
638 | // } | ||
639 | |||
640 | // ch1 >>= 1; ch2 >>= 1; | ||
641 | // } | ||
642 | // } else { | ||
643 | // for (h = 0; h < 8; h++) { | ||
644 | // data32 <<= 4; mask32 <<= 4; | ||
645 | |||
646 | // if ((ch1 | ch2) & 128) { | ||
647 | // u8 ch = (ch1 >> 7) | ((ch2 >> 7) << 1); | ||
648 | // data32 |= blending[ch][color]; | ||
649 | // mask32 |= 0xF; | ||
650 | // } | ||
335 | 651 | ||
336 | ch1 <<= 1; ch2 <<= 1; | 652 | // ch1 <<= 1; ch2 <<= 1; |
337 | } | 653 | // } |
338 | } | 654 | // } |
339 | 655 | ||
340 | u64 data = ((u64) (data32 & 0x33333333)) << shift; | 656 | // u64 data = ((u64) (data32 & 0x33333333)) << shift; |
341 | u64 mask = ~(((u64) (mask32 & 0x33333333)) << shift); | 657 | // u64 mask = ~(((u64) (mask32 & 0x33333333)) << shift); |
342 | 658 | ||
343 | layerptr[0] = (layerptr[0] & mask) | data; | 659 | // layerptr[0] = (layerptr[0] & mask) | data; |
344 | if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); | 660 | // if (xrightedge) layerptr[8] = (layerptr[8] & (mask >> 32)) | (data >> 32); |
345 | 661 | ||
346 | if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; | 662 | // if (((y + v) & 7) == 7) layerptr += (32 - 1) * 8; |
347 | } | 663 | // } |
348 | } | 664 | // } |
349 | 665 | ||
350 | dirty_tiles[y >> 3] |= dirtyflag; | 666 | // dirty_tiles[y >> 3] |= dirtyflag; |
351 | dirty_tiles[(y + 7) >> 3] |= dirtyflag; | 667 | // dirty_tiles[(y + 7) >> 3] |= dirtyflag; |
352 | } | 668 | } |
353 | 669 | ||
354 | IWRAM_CODE | 670 | IWRAM_CODE |
@@ -369,20 +685,20 @@ flipbuf(Ppu *p) { | |||
369 | Tile *mem_fg = FG_FRONT; | 685 | Tile *mem_fg = FG_FRONT; |
370 | Tile *mem_bg = BG_FRONT; | 686 | Tile *mem_bg = BG_FRONT; |
371 | for (size_t j = 0; j < 20; ++j) { | 687 | for (size_t j = 0; j < 20; ++j) { |
372 | if (dirty_tiles[j] == 0) { | 688 | // if (dirty_tiles[j] == 0) { |
373 | continue; | 689 | // continue; |
374 | } | 690 | // } |
375 | 691 | ||
376 | size_t k = 1; | 692 | size_t k = 1; |
377 | for (size_t i = 0; i < 30; ++i, k <<= 1) { | 693 | for (size_t i = 0; i < 30; ++i, k <<= 1) { |
378 | if (dirty_tiles[j] & k) { | 694 | // if (dirty_tiles[j] & k) { |
379 | Tile *tile_fg = p->fg; | 695 | Tile *tile_fg = p->fg; |
380 | Tile *tile_bg = p->bg; | 696 | Tile *tile_bg = p->bg; |
381 | mem_fg[i + j * 32] = tile_fg[i + j * 32]; | 697 | mem_fg[i + j * 32] = tile_fg[i + j * 32]; |
382 | mem_bg[i + j * 32] = tile_bg[i + j * 32]; | 698 | mem_bg[i + j * 32] = tile_bg[i + j * 32]; |
383 | } | 699 | // } |
384 | } | 700 | } |
385 | dirty_tiles[j] = 0; | 701 | // dirty_tiles[j] = 0; |
386 | } | 702 | } |
387 | } | 703 | } |
388 | 704 | ||