From a02af1a297eb2ea62e58e3208fcf03680c6f9908 Mon Sep 17 00:00:00 2001
From: jenz <jenz@jenz.jenz>
Date: Mon, 1 Jun 2026 20:52:20 +0100
Subject: [PATCH] Good enough for no-Steam clients: the primary focus was
 decoding the Steam voice packets correctly into subframes

---
 extension.cpp | 339 ++++++++++++++++++++++++--------------------------
 extension.h   |   2 -
 2 files changed, 164 insertions(+), 177 deletions(-)

diff --git a/extension.cpp b/extension.cpp
index c8985b8..ad9d499 100644
--- a/extension.cpp
+++ b/extension.cpp
@@ -231,7 +231,6 @@ DETOUR_DECL_MEMBER1(ProcessVoiceData, bool, void *, msg)
     {
         //convert steam OPUS packet to CELT for no steam clients
         g_Interface.PushPlayerVoiceData(playerSlot, nBytes, voiceDataBuffer);
-        g_Interface.HandlePlayerVoiceData(playerSlot); //send celt packets to nosteamers.
         // Send steam Opus packet to Steam clients
         int maxClients = iserver->GetClientCount();
         for (int i = 0; i < maxClients; i++)
@@ -252,7 +251,6 @@ DETOUR_DECL_MEMBER1(ProcessVoiceData, bool, void *, msg)
             SendVoiceDataMsg(playerSlot, pToClient, (unsigned char *)voiceDataBuffer, nBytes, voiceMsg->m_xuid);
         }
     }
-    g_fLastVoiceData[clientIndex] = gpGlobals->curtime;
     return true;
 }
 
@@ -260,25 +258,6 @@ DETOUR_DECL_STATIC4(SV_BroadcastVoiceData, void, IClient *, pClient, int, nBytes
 {
 }
 
-#ifdef _WIN32
-DETOUR_DECL_STATIC2(SV_BroadcastVoiceData_LTCG, void, char *, data, int64, xuid)
-{
-    IClient *pClient = NULL;
-    int nBytes = 0;
-
-    __asm mov pClient, ecx;
-    __asm mov nBytes, edx;
-
-    bool ret = g_Interface.OnBroadcastVoiceData(pClient, nBytes, data);
-
-    __asm mov ecx, pClient;
-    __asm mov edx, nBytes;
-
-    if(ret)
-        DETOUR_STATIC_CALL(SV_BroadcastVoiceData_LTCG)(data, xuid);
-}
-#endif
-
 double getTime()
 {
     struct timespec tv;
@@ -327,7 +306,6 @@ CVoice::CVoice()
     m_pCeltModePlayer  = NULL;
     m_torchMonoAccumLen     = 0;
     m_torchResampleAccum    = 0;
-    memset(m_playerResampleAccum, 0, sizeof(m_playerResampleAccum));
     memset(m_torchMonoAccum,  0, sizeof(m_torchMonoAccum));
     memset(m_nosteamSeqNum, 0, sizeof(m_nosteamSeqNum));
 
@@ -430,18 +408,7 @@ bool CVoice::SDK_OnLoad(char *error, size_t maxlength, bool late)
     // Setup voice detour.
     CDetourManager::Init(g_pSM->GetScriptingEngine(), NULL);
 
-#ifdef _WIN32
-    if (engineVersion == SOURCE_ENGINE_CSGO || engineVersion == SOURCE_ENGINE_INSURGENCY)
-    {
-        m_VoiceDetour = DETOUR_CREATE_STATIC(SV_BroadcastVoiceData_LTCG, adrVoiceData);
-    }
-    else
-    {
-        m_VoiceDetour = DETOUR_CREATE_STATIC(SV_BroadcastVoiceData, adrVoiceData);
-    }
-#else
     m_VoiceDetour = DETOUR_CREATE_STATIC(SV_BroadcastVoiceData, adrVoiceData);
-#endif
 
     if (!m_VoiceDetour)
     {
@@ -462,7 +429,6 @@ bool CVoice::SDK_OnLoad(char *error, size_t maxlength, bool late)
 
     //opus edit
     int err;
-    //m_OpusEncoder = opus_encoder_create(24000, 2, OPUS_APPLICATION_AUDIO, &err);
     m_OpusEncoder = opus_encoder_create(48000, 2, OPUS_APPLICATION_AUDIO, &err);
     if (err<0)
     {
@@ -494,7 +460,8 @@ bool CVoice::SDK_OnLoad(char *error, size_t maxlength, bool late)
 
     // CELT encoder (22050 Hz mono, 512 samples/frame, 64 byte packets)
     m_CeltEncoderSettings.SampleRate_Hz      = 22050;
-    m_CeltEncoderSettings.TargetBitRate_Kbps = 64;
+    //m_CeltEncoderSettings.TargetBitRate_Kbps = 64;
+    m_CeltEncoderSettings.TargetBitRate_Kbps = 48;
     m_CeltEncoderSettings.FrameSize          = 512;
     m_CeltEncoderSettings.PacketSize         = 64;
     m_CeltEncoderSettings.Complexity         = 10;
@@ -594,7 +561,7 @@ cell_t Native_SendCeltVoiceInit(IPluginContext *pContext, const cell_t *params)
 
     SVC_VoiceInit msg("vaudio_celt", 22050);
     pClient->SendNetMsg(msg);
-    smutils->LogMessage(myself, "Sent SVC_VoiceInit vaudio_celt to client %d", client);
+    //smutils->LogMessage(myself, "Sent SVC_VoiceInit vaudio_celt to client %d", client);
     return 1;
 }
 
@@ -807,16 +774,24 @@ void CVoice::OnGameFrame(bool simulating)
     HandleVoiceData(); //torchlight audio emitting
     HandleNoSteamVoiceData(); //send opus packets to steamers.
 
+    //send celt packets to nosteamers
+    int maxClients = iserver->GetClientCount();
+    for (int i = 0; i < maxClients; i++)
+    {
+        if (!m_pCeltCodecPlayer[i])
+            continue;
+
+        // Keep draining 512-sample blocks until the buffer has less than one full frame remaining
+        while (m_playerVoiceBuffer[i].TotalLength() >= m_CeltEncoderSettings.FrameSize)
+        {
+            HandlePlayerVoiceData(i);
+        }
+    }
+
     // Reset per-client voice byte counter to 0 every frame.
     memset(g_aFrameVoiceBytes, 0, sizeof(g_aFrameVoiceBytes));
 }
 
-bool CVoice::OnBroadcastVoiceData(IClient *pClient, int nBytes, char *data)
-{
-    //not actually used anymore anyways
-    return true;
-}
-
 void CVoice::HandleNetwork()
 {
     if(m_ListenSocket == -1)
@@ -1237,11 +1212,43 @@ void CVoice::PushPlayerVoiceData(int playerSlot, int nBytes, char *data)
         }
     }
 
+    double now = (double)gpGlobals->curtime;
+    double timeSinceLastVoice = now - g_fLastVoiceData[playerSlot + 1];
+    if (timeSinceLastVoice > 0.5 && g_fLastVoiceData[playerSlot + 1] != 0.0)
+    {
+        celt_encoder_ctl(m_pCeltCodecPlayer[playerSlot], CELT_RESET_STATE_REQUEST, NULL);
+        opus_decoder_ctl(m_PlayerOpusDecoder[playerSlot], OPUS_RESET_STATE);
+
+        // Clear out any stale, partial remaining samples left over in the queue
+        while(m_playerVoiceBuffer[playerSlot].TotalLength() > 0)
+        {
+            int16_t trash[512];
+            size_t toPop = m_playerVoiceBuffer[playerSlot].TotalLength() > 512 ? 512 : m_playerVoiceBuffer[playerSlot].TotalLength();
+            m_playerVoiceBuffer[playerSlot].Pop(trash, toPop);
+        }
+
+        //smutils->LogMessage(myself, "Voice states flushed cleanly. playerSlot: %d", playerSlot);
+    }
+
+    // Update the timestamp to mark this active packet's arrival
+    g_fLastVoiceData[playerSlot + 1] = now;
+
     const unsigned char *p = (const unsigned char *)data;
 
-    // Verify raw network packets match standard Valve Opus voice signatures
-    if (nBytes < 18 || p[8] != 0x0B || (p[11] != 0x05 && p[11] != 0x06))
-        return;
+    /*
+    int dumpLen = nBytes < 24 ? nBytes : 24;
+    char hexBuf[24 * 3 + 1];
+    int pos = 0;
+    for (int i = 0; i < dumpLen; i++)
+    {
+        static const char hex[] = "0123456789ABCDEF";
+        hexBuf[pos++] = hex[(p[i] >> 4) & 0xF];
+        hexBuf[pos++] = hex[p[i] & 0xF];
+        hexBuf[pos++] = ' ';
+    }
+    hexBuf[pos] = '\0';
+    smutils->LogMessage(myself, "[INBOUND STEAM] nBytes=%d, First bytes: %s", nBytes, hexBuf);
+    */
 
     uint16_t totalDataLength;
     memcpy(&totalDataLength, p + 12, sizeof(uint16_t));
@@ -1258,68 +1265,112 @@ void CVoice::PushPlayerVoiceData(int playerSlot, int nBytes, char *data)
 
     if (offset + 4 <= end)
     {
-        uint16_t frameLen;
-        memcpy(&frameLen, p + offset, sizeof(uint16_t));
-        offset += 2;
-        offset += 2;
+        uint16_t trueAudioPayloadLen;
+        memcpy(&trueAudioPayloadLen, p + 12, sizeof(uint16_t)); 
 
-        if (frameLen <= 2 || offset + (int)(frameLen - 2) > end)
-            return;
+        // Frame 1 always starts exactly at offset 18
+        int frame1Start = 18;
+        if (frame1Start >= end) return;
 
-        const unsigned char *opusFrame = p + offset;
-
-        // Maximum safe decoded PCM space buffer allocation (120ms frame at 24kHz = 2880 samples)
-        int16_t pcmBuf[2880];
-        int decoded = opus_decode(m_PlayerOpusDecoder[playerSlot],
-                                  opusFrame, frameLen - 2,
-                                  pcmBuf, 2880, 0);
-        if (decoded <= 0)
+        const unsigned char *frame1Ptr = p + frame1Start;
+        
+        // Find the start of Frame 2 by scanning for the next 0x68 TOC marker
+        int frame2Start = -1;
+        
+        // Heuristic: start scanning 30 bytes into Frame 1 so an early 0x68 payload byte is not mistaken for a TOC (assumes frames are >= 30 bytes; TODO confirm)
+        for (int i = frame1Start + 30; i < end - 4; i++)
         {
-            smutils->LogError(myself, "PushPlayerVoiceData: opus_decode failed: %s", opus_strerror(decoded));
-            return;
-        }
-
-        // PHASE-PERFECT FIXED POINT RESAMPLING (24000 Hz -> 22050 Hz)
-        // Shift left into 16.16 fixed-point space to prevent rounding/truncation drift errors
-        uint32_t step_fp = ((uint32_t)24000 << 16) / (uint32_t)m_CeltEncoderSettings.SampleRate_Hz;
-        uint32_t curr_fp = m_playerResampleAccum[playerSlot]; // Treat tracking value storage as raw fixed-point register
-
-        // Allocate temporary staging stack vector array to drop into ring-buffer cleanly in one pass
-        int16_t resampledStaging[2880];
-        int outSamplesCount = 0;
-
-        while (true)
-        {
-            uint32_t srcIndex = curr_fp >> 16;
-            if (srcIndex >= (uint32_t)decoded)
+            if (p[i] == 0x68) //TOC -> 0x68 
+            {
+                // No further validation is done: the first 0x68 byte found is assumed to be the start of Frame 2
+                frame2Start = i;
                 break;
-
-            // Linear Interpolation over adjacent samples to prevent digital harmonic hiss
-            int16_t s1 = pcmBuf[srcIndex];
-            int16_t s2 = (srcIndex + 1 < (uint32_t)decoded) ? pcmBuf[srcIndex + 1] : s1;
-
-            uint32_t frac = curr_fp & 0xFFFF;
-            int32_t interpolatedSample = s1 + (((int32_t)(s2 - s1) * (int32_t)frac) >> 16);
-
-            resampledStaging[outSamplesCount++] = (int16_t)interpolatedSample;
-            curr_fp += step_fp;
+            }
         }
 
-        // Normalize state trackers back relative to zero base offset
-        m_playerResampleAccum[playerSlot] = curr_fp - ((uint32_t)decoded << 16);
+        int16_t pcmFrameBuffer[960];
+        int totalDecodedSamples = 0;
 
-        // Bulk push data down onto RingBuffer payload safely in one operation
-        if (outSamplesCount > 0 && (size_t)outSamplesCount <= m_playerVoiceBuffer[playerSlot].CurrentFree())
+        // Calculate explicit size for Frame 1
+        int frame1Size = (frame2Start != -1) ? (frame2Start - frame1Start) : (end - frame1Start);
+
+        if (frame1Size > 0)
         {
-            m_playerVoiceBuffer[playerSlot].Push(resampledStaging, outSamplesCount);
+            //smutils->LogMessage(myself, "[VOICE-FIX] Frame 1 Determined Size: %d bytes. TOC: 0x%02X", frame1Size, frame1Ptr[0]);
+            
+            int samples1 = opus_decode(m_PlayerOpusDecoder[playerSlot],
+                                       frame1Ptr, frame1Size,
+                                       &pcmFrameBuffer[0], 480, 0);
+            
+            if (samples1 > 0) totalDecodedSamples += samples1;
+        }
+
+        // Process Frame 2 if a second TOC marker was identified
+        if (frame2Start != -1)
+        {
+            const unsigned char *frame2Ptr = p + frame2Start;
+            int frame2Size = end - frame2Start; // Frame 2 spans to the end of the payload buffer
+
+            if (frame2Size > 0)
+            {
+                //smutils->LogMessage(myself, "[VOICE-FIX] Frame 2 Determined Size: %d bytes. TOC: 0x%02X", frame2Size, frame2Ptr[0]);
+                
+                int samples2 = opus_decode(m_PlayerOpusDecoder[playerSlot],
+                                           frame2Ptr, frame2Size,
+                                           &pcmFrameBuffer[totalDecodedSamples], 480, 0);
+                
+                if (samples2 > 0) totalDecodedSamples += samples2;
+            }
+        }
+
+        int16_t resampledFrameBuffer[960];
+        int totalResampledSamples = 0;
+
+        if (totalDecodedSamples > 0)
+        {
+            // The exact conversion ratio factor is 147 / 160
+            // 480 samples at 24kHz converts precisely into 441 samples at 22.05kHz
+            // 960 samples (2 frames) converts precisely into 882 samples
+            totalResampledSamples = (totalDecodedSamples * 147) / 160;
+
+            for (int i = 0; i < totalResampledSamples; i++)
+            {
+                // Determine where this output sample lands on the input timeline
+                double srcPosition = (double)i * 160.0 / 147.0;
+                int srcIndex = (int)srcPosition;
+                double fraction = srcPosition - (double)srcIndex;
+
+                if (srcIndex + 1 < totalDecodedSamples)
+                {
+                    // Linear interpolate between the two closest matching input samples
+                    int16_t sampleA = pcmFrameBuffer[srcIndex];
+                    int16_t sampleB = pcmFrameBuffer[srcIndex + 1];
+                    resampledFrameBuffer[i] = (int16_t)(sampleA + fraction * (sampleB - sampleA));
+                }
+                else
+                {
+                    // Handle boundary edge case for the final trailing sample
+                    resampledFrameBuffer[i] = pcmFrameBuffer[srcIndex];
+                }
+            }
+        }
+        // --- PUSH SOUND TIMELINES SEQUENTIALLY ---
+        if (totalResampledSamples > 0)
+        {
+            size_t freeSpace = m_playerVoiceBuffer[playerSlot].CurrentFree();
+            if ((size_t)totalResampledSamples <= freeSpace)
+            {
+                // Push the resampled samples into the player's ring buffer; if they do not all fit, the whole batch is silently dropped
+                m_playerVoiceBuffer[playerSlot].Push(resampledFrameBuffer, totalResampledSamples);
+            }
         }
     }
 }
 
 void CVoice::HandlePlayerVoiceData(int playerSlot)
 {
-    const int CELT_FRAME_SIZE  = m_CeltEncoderSettings.FrameSize;
-    const int CELT_PACKET_SIZE = m_CeltEncoderSettings.PacketSize;
+    const int CELT_FRAME_SIZE  = m_CeltEncoderSettings.FrameSize;  
+    const int CELT_PACKET_SIZE = m_CeltEncoderSettings.PacketSize; 
     int maxClients = iserver->GetClientCount();
 
     if (!m_pCeltCodecPlayer[playerSlot])
@@ -1327,102 +1378,40 @@ void CVoice::HandlePlayerVoiceData(int playerSlot)
 
     size_t currentBufferLength = m_playerVoiceBuffer[playerSlot].TotalLength();
     if (currentBufferLength < (size_t)CELT_FRAME_SIZE)
-        return;
-
-    // Use Engine Time instead of System Time to stay perfectly in sync with server tickrates
-    double now = (double)gpGlobals->curtime;
-
-    double timeSinceLastVoice = now - g_fLastVoiceData[playerSlot + 1];
-    if (timeSinceLastVoice > 0.5) // Reduced to 500ms for responsiveness
     {
-        m_playerAvailableTime[playerSlot + 1] = 0.0;
-        
-        // Completely clear internal codec history matrices to ensure clean starts
-        if (m_pCeltCodecPlayer[playerSlot]) {
-            celt_encoder_ctl(m_pCeltCodecPlayer[playerSlot], CELT_RESET_STATE_REQUEST, NULL);
-        }
-        if (m_PlayerOpusDecoder[playerSlot]) {
-            opus_decoder_ctl(m_PlayerOpusDecoder[playerSlot], OPUS_RESET_STATE);
-        }
+        return; // Not enough data yet
     }
 
-    if (m_playerAvailableTime[playerSlot + 1] == 0.0)
+    // We have a solid block! Extract exactly one frame's worth of samples
+    int16_t celtInput[512];
+    if (!m_playerVoiceBuffer[playerSlot].Pop(celtInput, CELT_FRAME_SIZE))
     {
-        m_playerAvailableTime[playerSlot + 1] = now;
         return;
     }
 
-    double elapsed = now - m_playerAvailableTime[playerSlot + 1];
-    int framesToEmit = (int)(elapsed / m_CeltEncoderSettings.FrameTime);
+    // Run the encoder pass on our clean block
+    unsigned char celtPacket[64];
+    int celtBytes = celt_encode(m_pCeltCodecPlayer[playerSlot], celtInput,
+                                CELT_FRAME_SIZE, celtPacket, CELT_PACKET_SIZE);
 
-    // DYNAMIC JITTER BUFFER CATCH-UP
-    // If the server lag spikes and calculates a massive frame burst, check what is actually in the buffer.
-    // There's no point trying to emit 37 frames if the client has only sent 4 frames of real data!
-    int framesInRealBuffer = (int)(currentBufferLength / (size_t)CELT_FRAME_SIZE);
-
-    if (framesToEmit > framesInRealBuffer)
+    if (celtBytes > 0)
     {
-        framesToEmit = framesInRealBuffer;
-    }
-    //smutils->LogMessage(myself, "framesToEmit: %d", framesToEmit);
-
-    // Smooth-cap the maximum frames processed per server frame to avoid robotic bursts
-    // 4 frames = ~92ms of audio, which is an ideal ceiling for single-frame catchups.
-    if (framesToEmit > 4)
-    {
-        framesToEmit = 4;
-    }
-
-    if (framesToEmit <= 0)
-        return;
-
-    int framesProcessed = 0;
-
-    while (framesProcessed < framesToEmit &&
-           m_playerVoiceBuffer[playerSlot].TotalLength() >= (size_t)CELT_FRAME_SIZE)
-    {
-        int16_t celtInput[CELT_FRAME_SIZE];
-        if (!m_playerVoiceBuffer[playerSlot].Pop(celtInput, CELT_FRAME_SIZE))
+        for (int i = 0; i < maxClients; i++)
         {
-            break;
-        }
+            IClient *pToClient = iserver->GetClient(i);
+            if (!pToClient || !pToClient->IsConnected() || !pToClient->IsActive())
+                continue;
+            if (!g_bIsNoSteam[i + 1])
+                continue;
+            if (g_bClientMuted[i + 1][playerSlot + 1])
+                continue;
 
-        unsigned char celtPacket[CELT_PACKET_SIZE];
-        int celtBytes = celt_encode(m_pCeltCodecPlayer[playerSlot], celtInput,
-                                    CELT_FRAME_SIZE, celtPacket, CELT_PACKET_SIZE);
-        if (celtBytes > 0)
-        {
-            for (int i = 0; i < maxClients; i++)
-            {
-                IClient *pToClient = iserver->GetClient(i);
-                if (!pToClient || !pToClient->IsConnected() || !pToClient->IsActive())
-                    continue;
-                if (!g_bIsNoSteam[i + 1])
-                    continue;
-                if (g_bClientMuted[i + 1][playerSlot + 1])
-                    continue;
-                SendVoiceDataMsg(playerSlot, pToClient, celtPacket, celtBytes, 0);
-            }
+            SendVoiceDataMsg(playerSlot, pToClient, celtPacket, celtBytes, 0);
         }
-        else
-        {
-            smutils->LogError(myself, "HandlePlayerVoiceData: celt_encode failed: %d", celtBytes);
-            break;
-        }
-
-        framesProcessed++;
     }
-
-    // Advance our tracking clock safely based on what we processed
-    if (framesProcessed > 0)
+    else
     {
-        m_playerAvailableTime[playerSlot + 1] += (double)framesProcessed * m_CeltEncoderSettings.FrameTime;
-    }
-    else if (framesToEmit > 0)
-    {
-        // If we wanted to emit frames but the buffer was empty, pull the clock forward
-        // to 'now' so we don't build up a permanent structural timing lag.
-        m_playerAvailableTime[playerSlot + 1] = now;
+        smutils->LogError(myself, "HandlePlayerVoiceData: celt_encode failed: %d", celtBytes);
     }
 }
 
diff --git a/extension.h b/extension.h
index 9086e80..2fc74e8 100644
--- a/extension.h
+++ b/extension.h
@@ -137,7 +137,6 @@ public:  // IConCommandBaseAccessor
 public:
 	CVoice();
 	void OnGameFrame(bool simulating);
-	bool OnBroadcastVoiceData(IClient *pClient, int nBytes, char *data);
 
 	void ListenSocket();
     void PushPlayerVoiceData(int playerSlot, int nBytes, char *data);
@@ -174,7 +173,6 @@ private:
 
     // Player transcode state (24000 Hz mono -> 22050 Hz mono -> CELT)
     OpusDecoder *m_PlayerOpusDecoder[SM_MAXPLAYERS + 1];
-    int m_playerResampleAccum[SM_MAXPLAYERS + 1];
 
 	double m_AvailableTime;
     double m_playerAvailableTime[SM_MAXPLAYERS + 1];