What is the meaning of the output of the vad_multilingual_frame_marblenet model?

Hardware - CPU
Operating System: Ubuntu
NeMo Version: r1.20.0
Dear NVIDIA NeMo team,
I use the following code to do VAD detection with vad_multilingual_frame_marblenet:

import copy
import wave

import numpy as np
import pyaudio as pa
import torch
from torch.utils.data import DataLoader

import nemo.collections.asr as nemo_asr
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType

SAMPLE_RATE = 16000
vad_model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained('vad_multilingual_frame_marblenet')
cfg = copy.deepcopy(vad_model._cfg)  # model config; used below for the sample rate, preprocessor, encoder and labels
vad_model.eval()
vad_model = vad_model.to(vad_model.device)

# Simple streaming data layer that feeds one in-memory audio chunk at a time to the model.
class AudioDataLayer(IterableDataset):
    @property
    def output_types(self):
        return {
            'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        self.output = True

    def __iter__(self):
        return self

    def __next__(self):
        if not self.output:
            raise StopIteration
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), \
               torch.as_tensor(self.signal_shape, dtype=torch.int64)

    def set_signal(self, signal):
        # Scale int16 PCM samples to float32 in [-1, 1) and arm the iterator for one pass.
        self.signal = signal.astype(np.float32) / 32768.
        self.signal_shape = self.signal.size
        self.output = True

    def __len__(self):
        return 1

data_layer = AudioDataLayer(sample_rate=cfg.train_ds.sample_rate)
data_loader = DataLoader(data_layer, batch_size=1, collate_fn=data_layer.collate_fn)

def infer_signal(model, signal):
    data_layer.set_signal(signal)
    batch = next(iter(data_loader))
    audio_signal, audio_signal_len = batch
    audio_signal, audio_signal_len = audio_signal.to(vad_model.device), audio_signal_len.to(vad_model.device)
    logits = model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
    # what's the meaning of logits here?
    return logits

class RTFrameVAD:
    def __init__(self, model_definition,
                 threshold=0.5,
                 window_size=2, probs_num=3):
        self.vocab = list(model_definition['labels'])
        self.vocab.append('_')

        self.sr = model_definition['sample_rate']
        self.threshold = threshold
        self.window_size = window_size
        self.probs_num = probs_num
        self.probs = np.zeros(shape=self.probs_num,
                               dtype=np.float32)
        self.reset()

    def _decode(self, frame):
        # Only run inference if the chunk is longer than one 20 ms frame.
        if len(frame) > (0.02 * self.sr):
            logits = infer_signal(vad_model, frame).cpu().numpy()[0]
            decoded = self._greedy_decoder(
                self.threshold,
                logits,
                self.vocab,
            )
            return decoded
        else:
            return []


    @torch.no_grad()
    def transcribe(self, frame=None): 
        unmerged = self._decode(frame) 
        return unmerged

    def reset(self):
        self.probs=np.zeros(shape=self.probs.shape, dtype=np.float32)

    def _greedy_decoder(self, threshold, logits, vocab):
        s = []
        if logits.shape[0]:
            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
            for prob in probs:
                # Per-frame decision: label the frame as index 1 (speech) if that probability exceeds the threshold.
                preds = 1 if prob[1].item() > threshold else 0
                item = [preds, str(vocab[preds]), prob[0].item(), prob[1].item(), str(prob)]
                s.append(item)
        return s

def online_inference(wave_file, WINDOW_SIZE=(0.1-0.02), PROBS_NUM = 3, threshold=0.5):
    RATE = 16000
    CHUNK_SIZE = int(WINDOW_SIZE*RATE)
    vad = RTFrameVAD(model_definition = {
                   'sample_rate': SAMPLE_RATE,
                   'AudioToMFCCPreprocessor': cfg.preprocessor,
                   'JasperEncoder': cfg.encoder,
                   'labels': cfg.labels
               },
               threshold=threshold,
               window_size=WINDOW_SIZE, probs_num = PROBS_NUM)

    wf = wave.open(wave_file, 'rb')
    p = pa.PyAudio()
    empty_counter = 0
    preds = []
    proba_b = []
    proba_s = []
    # Read the file chunk by chunk and classify each chunk, including the first one.
    data = wf.readframes(CHUNK_SIZE)
    while len(data) > 0:
        signal = np.frombuffer(data, dtype=np.int16)
        result = vad.transcribe(signal)
        for item in result:
            preds.append(item[0])    # 0/1 speech decision per 20 ms frame
            proba_b.append(item[2])  # background probability
            proba_s.append(item[3])  # speech probability
        data = wf.readframes(CHUNK_SIZE)

    wf.close()
    p.terminate()
    vad.reset()

    return preds, proba_b, proba_s

Then I call online_inference() with WINDOW_SIZE=0.4 (seconds). I expected the shape of the logits returned by vad_model to be [1, 20, 2], since this model provides a sequence of speech probabilities for each 20 ms frame of the input audio and 400 ms / 20 ms = 20, but the actual shape of the logits is [1, 21, 2]. Why is there an extra item in the logits?
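For reference, here is a minimal standalone sketch of the shape check, stripped of the streaming wrapper above (my own reduction, assuming NeMo r1.20.0 and the same checkpoint; the silent dummy signal is only a placeholder):

import torch
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained('vad_multilingual_frame_marblenet')
model.eval()

with torch.no_grad():
    n = int(0.4 * 16000)                           # 0.4 s at 16 kHz = 6400 samples
    audio = torch.zeros(1, n)                      # dummy silent chunk, batch size 1
    length = torch.tensor([n], dtype=torch.int64)
    logits = model.forward(input_signal=audio, input_signal_length=length)
    print(logits.shape)                            # expected [1, 20, 2]; my streaming code above gives [1, 21, 2]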
Another strange thing: when I change WINDOW_SIZE from 0.4 to 1.0, for example, I expected the first 20 items of the logits to be the same as when WINDOW_SIZE=0.4, but there are no identical items between the two outputs. Does this mean that the number of audio frames in each batch affects the model's VAD result for each frame?
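Continuing the sketch above (same model and imports), this is the kind of comparison I am making, with a random dummy signal standing in for my real audio:

torch.manual_seed(0)
audio_1s = 0.1 * torch.randn(1, 16000)             # 1.0 s of dummy audio at 16 kHz
audio_04s = audio_1s[:, :6400]                     # the same leading 0.4 s on its own

with torch.no_grad():
    logits_04s = model.forward(input_signal=audio_04s,
                               input_signal_length=torch.tensor([6400], dtype=torch.int64))
    logits_1s = model.forward(input_signal=audio_1s,
                              input_signal_length=torch.tensor([16000], dtype=torch.int64))

# Compare the first 20 frames of both outputs; I expected these to be (nearly) identical.
n = min(20, logits_04s.shape[1], logits_1s.shape[1])
print(torch.allclose(logits_04s[:, :n], logits_1s[:, :n], atol=1e-4))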
Could you help check the questions above?
Thank you very much!

Hi @174362510

I suggest posting this in the NeMo GitHub Discussions area: NVIDIA/NeMo · Discussions · GitHub

You have a good chance of getting an answer there.

Cheers,
Tom