diff options
author | Bad Diode <bd@badd10de.dev> | 2024-01-23 11:21:14 +0100 |
---|---|---|
committer | Bad Diode <bd@badd10de.dev> | 2024-01-23 11:21:14 +0100 |
commit | 3c54d94191b0888af3712f7c330943068604cab8 (patch) | |
tree | 9207d386470d084fc1f23becd7dfd0039204bef4 /src/gba/utils.s | |
parent | a7ce765b1b57ec8a528263420852ed36da6d9d84 (diff) | |
download | stepper-3c54d94191b0888af3712f7c330943068604cab8.tar.gz stepper-3c54d94191b0888af3712f7c330943068604cab8.zip |
Add improved renderer routines with DMA option
Diffstat (limited to 'src/gba/utils.s')
-rwxr-xr-x | src/gba/utils.s | 82 |
1 files changed, 82 insertions, 0 deletions
diff --git a/src/gba/utils.s b/src/gba/utils.s new file mode 100755 index 0000000..d70d7ef --- /dev/null +++ b/src/gba/utils.s | |||
@@ -0,0 +1,82 @@ | |||
@ Assembler setup shared by every routine in this file.
        .file       "utils.s"
        .section    .iwram, "ax", %progbits     @ "ax" = allocatable + executable
        .arm                                    @ emit 32-bit ARM encodings (not Thumb)
        .align                                  @ NOTE(review): bare .align relies on the assembler's ARM default (presumably a 4-byte boundary) - confirm
5 | |||
@ copy32 - word-wise memory copy (memcpy32 routine borrowed from TONC).
@
@ Works in two phases: whole 32-byte blocks are moved eight words at a time
@ with ldmia/stmia, then the remaining 0-7 words are moved one by one.
@
@ In:
@   r0: destination address
@   r1: source address
@   r2: number of 32-bit words to copy (0 returns immediately)
@ Out:
@   nothing; r0 and r1 end up just past the last word written / read
@ Clobbers:
@   r2, r3, r12, flags (r4-r10 are saved and restored around the block loop)
@
@ Both buffers are accessed with word loads/stores only, so the addresses are
@ expected to be 4-byte aligned.
@
        .global     copy32
        .type       copy32, %function       @ typed as a function so the linker can interwork calls
copy32:
        cmp         r2, #0
        bxeq        lr                      @ nothing to copy

        and         r12, r2, #7             @ r12 = leftover words  (count % 8)
        movs        r2, r2, lsr #3          @ r2  = 8-word blocks   (count / 8)
        beq         .Lcopy32_residual       @ fewer than 8 words: skip the block loop

        @ Block loop: one 32-byte block (8 words) per iteration; r2 >= 1 here.
        push        {r4-r10}
.Lcopy32_chunks:
        ldmia       r1!, {r3-r10}
        stmia       r0!, {r3-r10}
        subs        r2, r2, #1
        bne         .Lcopy32_chunks
        pop         {r4-r10}

        @ Residual loop: copies the remaining r12 (0-7) words one at a time.
.Lcopy32_residual:
        subs        r12, r12, #1            @ carry stays set only if a word was still pending
        ldrhs       r3, [r1], #4
        strhs       r3, [r0], #4
        bhi         .Lcopy32_residual       @ taken only when more words remain after this one

        bx          lr
        .size       copy32, . - copy32
42 | |||
43 | |||
@ set32 - word-wise memory fill (memset32 routine borrowed from TONC).
@
@ Works in two phases: whole 32-byte blocks are written eight words at a time
@ with a single stmia from registers preloaded with the fill value, then the
@ remaining 0-7 words are written one by one.
@
@ In:
@   r0: destination address
@   r1: 32-bit value to store in every word
@   r2: number of 32-bit words to set (0 returns immediately)
@ Out:
@   nothing; r0 ends up just past the last word written
@ Clobbers:
@   r2, r3, r12, flags (r4-r9 are saved and restored around the block loop)
@
@ The destination is accessed with word stores only, so the address is
@ expected to be 4-byte aligned.
@
        .global     set32
        .type       set32, %function        @ typed as a function so the linker can interwork calls
set32:
        cmp         r2, #0
        bxeq        lr                      @ nothing to set

        and         r12, r2, #7             @ r12 = leftover words  (count % 8)
        movs        r2, r2, lsr #3          @ r2  = 8-word blocks   (count / 8)
        beq         .Lset32_residual        @ fewer than 8 words: skip the block loop

        @ Block loop: one 32-byte block (8 words) per iteration; r2 >= 1 here.
        @ r1 plus seven copies of it (r3-r9) supply the eight words per stmia.
        push        {r4-r9}
        mov         r3, r1
        mov         r4, r1
        mov         r5, r1
        mov         r6, r1
        mov         r7, r1
        mov         r8, r1
        mov         r9, r1
.Lset32_chunks:
        stmia       r0!, {r1, r3-r9}
        subs        r2, r2, #1
        bne         .Lset32_chunks
        pop         {r4-r9}

        @ Residual loop: writes the remaining r12 (0-7) words one at a time.
.Lset32_residual:
        subs        r12, r12, #1            @ carry stays set only if a word was still pending
        strhs       r1, [r0], #4
        bhi         .Lset32_residual        @ taken only when more words remain after this one

        bx          lr
        .size       set32, . - set32