From 6dce0bf202267616fc63c3b818fc9439def925a5 Mon Sep 17 00:00:00 2001
From: Bad Diode <bd@badd10de.dev>
Date: Fri, 28 May 2021 17:17:01 +0200
Subject: Vectorize downsample for audio mixdown

This results in 3K fewer cycles per sound_mix call
---
 src/uxn/devices/apu.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/uxn/devices/apu.c b/src/uxn/devices/apu.c
index d968840..0b502f8 100644
--- a/src/uxn/devices/apu.c
+++ b/src/uxn/devices/apu.c
@@ -183,7 +183,7 @@ void sound_mix() {
     u32 fill = 0;
     dma_fill(mix_buffer, fill, sizeof(mix_buffer), 3);
 
-    //  Mix channels into the temporary buffer.
+    // Mix channels into the temporary buffer.
     for (size_t j = 0; j < POLYPHONY; ++j) {
         AudioChannel *ch = &channels[j];
         // Check if channel is active.
@@ -203,7 +203,7 @@ void sound_mix() {
             // stopping.
             for(size_t i = 0; i < AUDIO_BUF_LEN; i++) {
                 // Remember we are using fixed point values.
-                mix_buffer[i] += (0x80 + (u8)ch->data[ch->pos >> 12]) * vol;
+                mix_buffer[i] += (0x80 ^ ch->data[ch->pos >> 12]) * vol;
                 ch->pos += ch->inc;
 
                 if (ch->pos >= ch->length) {
@@ -223,16 +223,20 @@ void sound_mix() {
             // Sample still have room to go, no need to check for looping or
             // end of sample.
             for(size_t i = 0; i < AUDIO_BUF_LEN; i++) {
-                mix_buffer[i] +=  (0x80 + (u8)ch->data[ch->pos>>12]) * ch->vol;
+                mix_buffer[i] +=  (0x80 ^ ch->data[ch->pos>>12]) * vol;
                 ch->pos += ch->inc;
             }
         }
     }
 
-    // Downsample and copy to the playing buffer.
-    for (size_t i = 0; i < AUDIO_BUF_LEN; ++i) {
-        // >> 6 to divide off the volume, >> 2 to divide by 4 channels to
-        // prevent overflow.
-        audio.current_buffer[i] = mix_buffer[i] >> 8;
+    // Downsample and copy to the playing buffer (Vectorized).
+    u64 *mix = mix_buffer;
+    u32 *buf = audio.current_buffer;
+    for (size_t i = 0, k = 0; i < AUDIO_BUF_LEN; i += 4, k++) {
+        u64 x = mix[k];
+        buf[k] = (x >> 8) & 0xFF
+            | (x >> 16) & 0xFF00
+            | (x >> 24) & 0xFF0000
+            | (x >> 32) & 0xFF000000;
     }
 }
-- 
cgit v1.2.1