Distorted audio stream from Python to streaming player

I’m streaming chunks received from OpenAI TTS to the Audio2Face streaming player with a Python generator. It works, but the audio is distorted. The lip sync works fine. I don’t get it… any idea?

```
import time

import grpc
import numpy as np

import audio2face_pb2
import audio2face_pb2_grpc


def stream_audio_direct(response_stream, url, instance_name):
    print("Entry in stream_audio_direct function.")
    channel = grpc.insecure_channel(url)
    stub = audio2face_pb2_grpc.Audio2FaceStub(channel)
    samplerate = 24000
    # Chunk size derived from the sample rate; this determines how much data is processed at once
    chunk_size = samplerate // 10

    # Generator function that yields the gRPC streaming requests
    def generate_audio_requests():
        # Send the start marker first
        start_marker = audio2face_pb2.PushAudioRequestStart(
            samplerate=samplerate,  # Use the given sample rate
            instance_name=instance_name,
            block_until_playback_is_finished=True,
        )
        yield audio2face_pb2.PushAudioStreamRequest(start_marker=start_marker)

        # Buffer for incoming audio bytes
        buffer = bytearray()
        for chunk in response_stream.iter_bytes():
            buffer += chunk
            # Process the buffer in chunks based on the calculated chunk size
            while len(buffer) >= chunk_size:
                # Trim to a multiple of 4 bytes so we only process complete int16 samples
                if len(buffer) % 4 != 0:
                    process_size = len(buffer) - (len(buffer) % 4)
                else:
                    process_size = len(buffer)

                audio_data = np.frombuffer(buffer[:process_size], dtype=np.int16).astype(np.float32).tobytes()

                buffer = buffer[process_size:]

                yield audio2face_pb2.PushAudioStreamRequest(audio_data=audio_data)
                time.sleep(0.04)  # Introduce a delay between sending chunks

        # Handle any remaining buffer data
        if len(buffer) > 0:
            yield audio2face_pb2.PushAudioStreamRequest(
                audio_data=np.frombuffer(buffer, dtype=np.int16).astype(np.float32).tobytes()
            )

    # Create the streaming call
    try:
        responses = stub.PushAudioStream(generate_audio_requests())
        for response in responses:
            print("Received response from server:", response)
    except Exception as e:
        print(f"An error occurred: {e}")
```

Haha, I love replying to my own question with the right answer!
I forgot to normalize before converting to the byte array:

```
audio_data = np.frombuffer(buffer[:process_size], dtype=np.int16).astype(np.float32) / 32768.0
yield audio2face_pb2.PushAudioStreamRequest(audio_data=audio_data.tobytes())
```
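In other words, the streaming player seems to expect float32 samples in the [-1.0, 1.0] range, and raw int16 values cast to float32 are thousands of times larger than that, which is what was clipping and distorting the playback. Here is a minimal, self-contained sketch of the conversion (the helper name `pcm16_bytes_to_float32` is just for illustration, not part of the Audio2Face API):

```
import numpy as np

def pcm16_bytes_to_float32(pcm_bytes: bytes) -> np.ndarray:
    """Convert 16-bit PCM bytes to float32 samples in the [-1.0, 1.0) range."""
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    # int16 full scale is 32768, so this maps -32768..32767 onto roughly -1.0..1.0
    return samples.astype(np.float32) / 32768.0

# Quick sanity check: full negative scale and half positive scale
raw = np.array([-32768, 16384], dtype=np.int16).tobytes()
print(pcm16_bytes_to_float32(raw))  # -> [-1.0, 0.5]
```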

@sebastian86 As another user, normalizing the audio was actually something I learned recently as well, in another thread!
