Hardware - CPU
Operating System: Ubuntu
Riva Version: NeMo r1.20.0
Dear NVIDIA NeMo team,
I use the following code to do VAD detection with vad_multilingual_frame_marblenet:
import copy
import wave

import numpy as np
import pyaudio as pa
import torch
from torch.utils.data import DataLoader

import nemo.collections.asr as nemo_asr
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType

SAMPLE_RATE = 16000
vad_model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained('vad_multilingual_frame_marblenet')
vad_model.eval()
vad_model = vad_model.to(vad_model.device)
cfg = copy.deepcopy(vad_model._cfg)  # keep a copy of the model config (sample rate, labels, preprocessor, encoder)
# Simple iterable data layer that feeds one in-memory audio buffer to the model at a time.
class AudioDataLayer(IterableDataset):
    @property
    def output_types(self):
        return {
            'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        self.output = True

    def __iter__(self):
        return self

    def __next__(self):
        if not self.output:
            raise StopIteration
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), \
               torch.as_tensor(self.signal_shape, dtype=torch.int64)

    def set_signal(self, signal):
        self.signal = signal.astype(np.float32) / 32768.
        self.signal_shape = self.signal.size
        self.output = True

    def __len__(self):
        return 1
data_layer = AudioDataLayer(sample_rate=cfg.train_ds.sample_rate)
data_loader = DataLoader(data_layer, batch_size=1, collate_fn=data_layer.collate_fn)

# Run the model on one chunk of raw int16 audio and return the frame-level logits.
def infer_signal(model, signal):
    data_layer.set_signal(signal)
    batch = next(iter(data_loader))
    audio_signal, audio_signal_len = batch
    audio_signal, audio_signal_len = audio_signal.to(vad_model.device), audio_signal_len.to(vad_model.device)
    logits = model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
    # what's the meaning of logits here?
    return logits
class RTFrameVAD:
    def __init__(self, model_definition,
                 threshold=0.5,
                 window_size=2, probs_num=3):
        self.vocab = list(model_definition['labels'])
        self.vocab.append('_')
        self.sr = model_definition['sample_rate']
        self.threshold = threshold
        self.window_size = window_size
        self.probs_num = probs_num
        self.probs = np.zeros(shape=self.probs_num, dtype=np.float32)
        self.reset()

    def _decode(self, frame):
        # Skip chunks shorter than one 20 ms frame.
        if len(frame) > (0.02 * self.sr):
            logits = infer_signal(vad_model, frame).cpu().numpy()[0]
            decoded = self._greedy_decoder(
                self.threshold,
                logits,
                self.vocab
            )
            return decoded
        else:
            return []

    @torch.no_grad()
    def transcribe(self, frame=None):
        unmerged = self._decode(frame)
        return unmerged

    def reset(self):
        self.probs = np.zeros(shape=self.probs.shape, dtype=np.float32)

    def _greedy_decoder(self, threshold, logits, vocab):
        # Each returned item: [prediction, label, P(non-speech), P(speech), raw probs].
        s = []
        if logits.shape[0]:
            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
            for prob in probs:
                preds = 1 if prob[1].item() > threshold else 0
                item = [preds, str(vocab[preds]), prob[0].item(), prob[1].item(), str(prob)]
                s.append(item)
        return s
def online_inference(wave_file, WINDOW_SIZE=(0.1 - 0.02), PROBS_NUM=3, threshold=0.5):
    RATE = 16000
    CHUNK_SIZE = int(WINDOW_SIZE * RATE)
    vad = RTFrameVAD(model_definition={
                         'sample_rate': SAMPLE_RATE,
                         'AudioToMFCCPreprocessor': cfg.preprocessor,
                         'JasperEncoder': cfg.encoder,
                         'labels': cfg.labels
                     },
                     threshold=threshold,
                     window_size=WINDOW_SIZE, probs_num=PROBS_NUM)
    wf = wave.open(wave_file, 'rb')
    p = pa.PyAudio()
    preds = []
    proba_b = []
    proba_s = []

    # Read the wav file chunk by chunk and run frame VAD on each chunk.
    data = wf.readframes(CHUNK_SIZE)
    while len(data) > 0:
        signal = np.frombuffer(data, dtype=np.int16)
        result = vad.transcribe(signal)
        for item in result:
            preds.append(item[0])
            proba_b.append(item[2])
            proba_s.append(item[3])
        data = wf.readframes(CHUNK_SIZE)

    wf.close()
    p.terminate()
    vad.reset()
    return preds, proba_b, proba_s
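For reference, the call looks roughly like this (the wav path is a placeholder for my own 16 kHz mono, 16-bit PCM test file):

# placeholder path; any 16 kHz mono 16-bit PCM wav file
preds, proba_b, proba_s = online_inference('my_test_16k_mono.wav', WINDOW_SIZE=0.4)
print(len(preds))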
I call online_inference() with WINDOW_SIZE=0.4 (seconds). I expected the logits returned by vad_model for each chunk to have shape [1, 20, 2], since the model produces a speech probability for every 20 ms frame of the input audio and 400 ms / 20 ms = 20, but the actual shape is [1, 21, 2]. Why is there an extra item in the logits?
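A minimal check of the shape, feeding one 0.4 s chunk (zeros here, only to inspect the shape) directly into infer_signal:

chunk = np.zeros(int(0.4 * SAMPLE_RATE), dtype=np.int16)  # 6400 samples = 400 ms
with torch.no_grad():
    logits = infer_signal(vad_model, chunk)
print(logits.shape)  # I expected torch.Size([1, 20, 2]) but get torch.Size([1, 21, 2])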
Another strange thing: when I change WINDOW_SIZE from 0.4 to 1.0, for example, I expected the first 20 items of the logits to be the same as they are with WINDOW_SIZE=0.4, but not a single item matches between the two outputs. Does this mean that the number of audio frames in each batch affects the model's VAD result for each individual frame?
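A sketch of how I compare the two settings (random noise here instead of my real wav file, just to show the comparison):

audio = np.random.randint(-5000, 5000, int(1.0 * SAMPLE_RATE)).astype(np.int16)  # 1.0 s of noise
with torch.no_grad():
    logits_04 = infer_signal(vad_model, audio[:int(0.4 * SAMPLE_RATE)]).cpu().numpy()[0]
    logits_10 = infer_signal(vad_model, audio).cpu().numpy()[0]
# compare the frames that cover the shared first 0.4 s of audio
print(np.allclose(logits_04[:20], logits_10[:20], atol=1e-4))  # False in my runs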
Could you help look into the questions above?
Thank you very much!