I am trying to implement RIVA ASR StreamingRecognition in my conversational AI application.
This is the ASR client method:
def stream_transcribe_audio(channel, audio_chunk):
    """Transcribe audio with a single Riva ``StreamingRecognize`` call.

    Args:
        channel: gRPC channel passed to ``RivaSpeechRecognitionStub``.
        audio_chunk: Raw audio matching the config below (LINEAR_PCM,
            16 kHz). Either one bytes-like buffer, which is sent as a single
            audio request, or an iterable of such buffers, which are sent as
            consecutive audio requests on the same stream.

    Returns:
        Every final transcript received, each followed by one space, joined
        in arrival order. Returns ``""`` when no final result arrived or when
        the call could not be started.
    """
    stub = riva_asr_pb2_grpc.RivaSpeechRecognitionStub(channel)

    # Prepare the streaming recognition configuration
    streaming_config = riva_asr_pb2.StreamingRecognitionConfig(
        config=riva_asr_pb2.RecognitionConfig(
            encoding=riva_audio_pb2.AudioEncoding.LINEAR_PCM,
            sample_rate_hertz=16000,
            language_code="en-US",
            max_alternatives=1,
            enable_automatic_punctuation=True,
        ),
    )

    # Create a generator for streaming requests
    def request_generator(config, audio):
        # The configuration request goes first, followed by audio only.
        yield riva_asr_pb2.StreamingRecognizeRequest(streaming_config=config)
        # bytes is itself iterable (of ints), so test for a single buffer
        # before treating the argument as a sequence of buffers.
        if isinstance(audio, (bytes, bytearray, memoryview)):
            audio = (audio,)
        for piece in audio:
            yield riva_asr_pb2.StreamingRecognizeRequest(audio_content=piece)

    try:
        responses = stub.StreamingRecognize(request_generator(streaming_config, audio_chunk))
        print("Server response received, starting to process...")
    except Exception as e:
        print(f"Exception during streaming: {e}")  # To log exceptions
        return ""

    transcript_parts = []
    try:
        for response in responses:
            # Detailed logging of each response
            print(f"Response received: {response}")
            if hasattr(response, 'error') and response.error.code != 0:
                print(f"Error in response: {response.error.message}")
                break
            # Look at every result, not only the first one, so that no final
            # result carried by the same response is dropped.
            for result in getattr(response, 'results', ()):
                if not result.is_final:
                    continue
                if not result.alternatives:
                    # A final result without alternatives has no text; skip
                    # it instead of letting an IndexError end the whole loop.
                    continue
                transcript_part = result.alternatives[0].transcript
                print(f"Transcript part: {transcript_part}")  # Log each transcript part
                transcript_parts.append(transcript_part)
    except Exception as e:
        print(f"Error in streaming transcription: {e}")
        print("Detailed error information:")
        print(type(e).__name__)
        print(e.args)

    # Same shape as before: every part is followed by exactly one space.
    transcript = "".join(f"{part} " for part in transcript_parts)
    print(f"Final Transcript: {transcript}")  # Log the final transcript
    return transcript
Script Response Logs:
Please speak...
Recording... Press Enter to stop.
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received:
Final Transcript:
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received: results {
alternatives {
transcript: "Hello. "
confidence: -1.613735556602478
}
is_final: true
channel_tag: 1
audio_processed: 0.6399999856948853
}
Transcript part: Hello.
Final Transcript: Hello.
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received: results {
alternatives {
transcript: "Can somebody "
confidence: -1.8710156679153442
}
is_final: true
channel_tag: 1
audio_processed: 0.6399999856948853
}
Transcript part: Can somebody
Final Transcript: Can somebody
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received: results {
alternatives {
transcript: "You hear me right. "
confidence: -1.8283002376556396
}
is_final: true
channel_tag: 1
audio_processed: 0.6399999856948853
}
Transcript part: You hear me right.
Final Transcript: You hear me right.
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received: results {
alternatives {
transcript: "Right now. "
confidence: -1.4808837175369263
}
is_final: true
channel_tag: 1
audio_processed: 0.6399999856948853
}
Transcript part: Right now.
Final Transcript: Right now.
Received audio chunk: 16384 bytes
ASRThread running, processing audio queue
Sending audio chunk of size 16384 to RIVA ASR
Server response received, starting to process...
Response received:
Response received:
Response received:
Response received:
Final Transcript:
Received audio chunk: 16384 bytes
Stopped recording.
Received transcription:
Please speak...
Recording... Press Enter to stop.
Received audio chunk: 16384 bytes
Stopped recording.
Server Logs:
I1120 11:02:22.489441 5775 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:22.489544 5775 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:22.489671 5775 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
I1120 11:02:22.570063 5775 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:22.570289 5775 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"e9010209-07b6-41bd-93a6-5c938bc28d03","datacontenttype":"application/json","time":"2023-11-20T11:02:22.489424783+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
I1120 11:02:23.008147 5798 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:23.008957 5798 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:23.009198 5798 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
E1120 11:02:23.088802 11998 normalize.cc:150] Class hints empty.
E1120 11:02:23.088826 11998 normalize.cc:156] normalizer failed to apply speechhints 'hello'. Returning output unchanged.
I1120 11:02:23.120199 5798 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:23.120492 5798 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"1bad763d-67ac-4a75-ab1e-512c972f509b","datacontenttype":"application/json","time":"2023-11-20T11:02:23.008127535+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
I1120 11:02:23.513077 5775 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:23.513692 5775 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:23.513990 5775 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
E1120 11:02:23.567021 12001 normalize.cc:150] Class hints empty.
E1120 11:02:23.567047 12001 normalize.cc:156] normalizer failed to apply speechhints 'can somebody'. Returning output unchanged.
I1120 11:02:23.578464 5775 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:23.578678 5775 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"0d9c4423-bcc4-4f9d-ac4a-02641344ca5d","datacontenttype":"application/json","time":"2023-11-20T11:02:23.513060427+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
I1120 11:02:24.033721 5798 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:24.034348 5798 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:24.034597 5798 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
E1120 11:02:24.088775 12004 normalize.cc:150] Class hints empty.
E1120 11:02:24.088806 12004 normalize.cc:156] normalizer failed to apply speechhints 'you hear me right'. Returning output unchanged.
I1120 11:02:24.100689 5798 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:24.100970 5798 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"980d827d-c5cc-4322-b078-bcf5739baf5f","datacontenttype":"application/json","time":"2023-11-20T11:02:24.033702957+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
I1120 11:02:24.602605 5798 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:24.603236 5798 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:24.603442 5798 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
E1120 11:02:24.656375 12007 normalize.cc:150] Class hints empty.
E1120 11:02:24.656402 12007 normalize.cc:156] normalizer failed to apply speechhints 'right now'. Returning output unchanged.
I1120 11:02:24.669127 5798 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:24.669392 5798 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"34d678f4-4524-4bbe-98c5-07522fe99dd8","datacontenttype":"application/json","time":"2023-11-20T11:02:24.602589539+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
I1120 11:02:25.062372 5798 grpc_riva_asr.cc:1579] ASRService.StreamingRecognize called.
I1120 11:02:25.063171 5798 grpc_riva_asr.cc:1794] Using model conformer-en-US-asr-streaming from Triton localhost:8001
I1120 11:02:25.063496 5798 riva_asr_stream.cc:226] Detected format: encoding = 1 RAW numchannels = 1 samplerate = 16000 bitspersample = 16
I1120 11:02:25.149184 5798 grpc_riva_asr.cc:1910] ASRService.StreamingRecognize returning OK
I1120 11:02:25.149407 5798 stats_builder.h:100] {"specversion":"1.0","type":"riva.asr.streamingrecognize.v1","source":"","subject":"","id":"87d549be-8b77-491b-9083-528b21971f9b","datacontenttype":"application/json","time":"2023-11-20T11:02:25.062354469+00:00","data":{"release_version":"2.13.0","customer_uuid":"","ngc_org":"","ngc_team":"","ngc_org_team":"","container_uuid":"","language_code":"en-US","request_count":1,"audio_duration":0.5120000243186951,"speech_duration":0.0,"status":0,"err_msg":""}}
The microphone is of good quality, and audio recording was tested and works correctly in the application.
I tried chunk sizes smaller than 8192, which yielded worse results or none at all.
The sample rate is set to 16000.
Still, the confidence values for the received transcription responses always seem to be negative.
Recording Code:
def record_audio(self, asr_thread):
    """Record in the background until the user presses Enter.

    Starts ``_record_audio_thread`` on a worker thread, blocks on ``input``,
    then calls ``asr_thread.stop()`` and waits for the worker to finish.
    """
    # Capture runs on its own thread so this one is free to block on input().
    worker = threading.Thread(
        target=self._record_audio_thread,
        args=(asr_thread,),
    )
    worker.start()

    input("Recording... Press Enter to stop.\n")

    # Signal the ASR thread to stop processing, then wait for the recorder.
    asr_thread.stop()
    worker.join()
def _record_audio_thread(self, asr_thread):
    """Forward microphone chunks to ``asr_thread.audio_queue`` while it runs.

    The ``running`` flag is checked after each chunk is read, so the chunk
    that arrives once the flag is cleared is discarded and the loop ends.
    """
    with MicrophoneStream(rate=16000, chunk=8192) as mic:
        for chunk in mic:
            if asr_thread.running:
                asr_thread.audio_queue.put(chunk)
            else:
                break
    print("Stopped recording.")
Chunk processing and the call to the streaming transcription client method:
class ASRThread(threading.Thread):
    """Worker thread that sends queued audio chunks to the ASR client.

    Chunks placed on ``audio_queue`` are passed one at a time to the
    module-level ``stream_transcribe_audio``. The most recent non-empty
    transcript is kept in ``latest_transcription``.
    """

    # Seconds to block waiting for a chunk before re-checking ``running``.
    _POLL_INTERVAL = 0.1

    def __init__(self):
        super().__init__()
        self.channel = create_channel()
        self.audio_queue = queue.Queue()
        self.running = True
        self.latest_transcription = ""

    def run(self):
        """Process queued chunks until ``stop`` is called, then drain the queue."""
        while self.running:
            try:
                # Block with a timeout instead of spinning on ``empty()``,
                # which would keep a CPU core busy while no audio arrives.
                audio_chunk = self.audio_queue.get(timeout=self._POLL_INTERVAL)
            except queue.Empty:
                continue
            self._process_chunk(audio_chunk)

        # Handle whatever was queued before the stop request so trailing
        # audio is not silently discarded.
        while True:
            try:
                audio_chunk = self.audio_queue.get_nowait()
            except queue.Empty:
                break
            self._process_chunk(audio_chunk)

    def _process_chunk(self, audio_chunk):
        """Log and transcribe a single dequeued chunk."""
        print("ASRThread running, processing audio queue")  # To confirm thread is processing
        # Ensure audio_chunk is raw audio bytes
        self.stream_transcribe_audio(audio_chunk)

    def stream_transcribe_audio(self, audio_chunk):
        """Transcribe one chunk and remember the result if it contains text.

        Any exception is logged and swallowed so that one failed request does
        not end the worker thread.
        """
        try:
            print(f"Sending audio chunk of size {len(audio_chunk)} to RIVA ASR")  # To check size of audio being sent
            transcript = stream_transcribe_audio(self.channel, audio_chunk)
            # A chunk with no recognised speech yields an empty transcript;
            # do not let it erase the last real result.
            if transcript and transcript.strip():
                self.latest_transcription = transcript
        except Exception as e:
            print(f"Error during transcription: {e}")

    def get_latest_transcription(self):
        """Return the most recent non-empty transcript, or ``""`` if none yet."""
        return self.latest_transcription

    def stop(self):
        """Ask ``run`` to finish; it exits after draining the queue."""
        self.running = False
I would appreciate help figuring out how to set this up correctly. Thanks!
Hardware - GPU RTX 4090
Hardware - CPU
Operating System Windows11
Riva Version 2.13.0
TLT Version (if relevant)
How to reproduce the issue ? (This is for errors. Please share the command and the detailed log here)