What is the meaning of the output of the vad_multilingual_frame_marblenet model?

Hardware - CPU
Operating System: Ubuntu
NeMo Version: r1.20.0
Dear NVIDIA NeMo team,
I use the following code to do VAD detection with vad_multilingual_frame_marblenet:

import copy
import wave

import numpy as np
import pyaudio as pa
import torch
from torch.utils.data import DataLoader

import nemo.collections.asr as nemo_asr
from nemo.core.classes import IterableDataset
from nemo.core.neural_types import NeuralType, AudioSignal, LengthsType

SAMPLE_RATE = 16000
vad_model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained('vad_multilingual_frame_marblenet')
cfg = copy.deepcopy(vad_model._cfg)  # model config; used below for the sample rate, preprocessor, encoder and labels
vad_model.eval()
vad_model = vad_model.to(vad_model.device)

# Simple streaming data layer that feeds one in-memory audio chunk at a time to the model.
class AudioDataLayer(IterableDataset):
    @property
    def output_types(self):
        return {
            'audio_signal': NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            'a_sig_length': NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(self, sample_rate):
        super().__init__()
        self._sample_rate = sample_rate
        self.output = True

    def __iter__(self):
        return self

    def __next__(self):
        if not self.output:
            raise StopIteration
        self.output = False
        return torch.as_tensor(self.signal, dtype=torch.float32), \
               torch.as_tensor(self.signal_shape, dtype=torch.int64)

    def set_signal(self, signal):
        # Scale int16 PCM samples to float32 in [-1, 1) and arm the iterator for one pass.
        self.signal = signal.astype(np.float32) / 32768.
        self.signal_shape = self.signal.size
        self.output = True

    def __len__(self):
        return 1

data_layer = AudioDataLayer(sample_rate=cfg.train_ds.sample_rate)
data_loader = DataLoader(data_layer, batch_size=1, collate_fn=data_layer.collate_fn)

def infer_signal(model, signal):
    data_layer.set_signal(signal)
    batch = next(iter(data_loader))
    audio_signal, audio_signal_len = batch
    audio_signal, audio_signal_len = audio_signal.to(vad_model.device), audio_signal_len.to(vad_model.device)
    logits = model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)
    # what's the meaning of logits here?
    return logits

class RTFrameVAD:
    def __init__(self, model_definition,
                 threshold=0.5,
                 window_size=2, probs_num=3):
        self.vocab = list(model_definition['labels'])
        self.vocab.append('_')

        self.sr = model_definition['sample_rate']
        self.threshold = threshold
        self.window_size = window_size
        self.probs_num = probs_num
        self.probs = np.zeros(shape=self.probs_num,
                               dtype=np.float32)
        self.reset()

    def _decode(self, frame):
        # Only run inference if the chunk is longer than one 20 ms frame.
        if len(frame) > (0.02 * self.sr):
            logits = infer_signal(vad_model, frame).cpu().numpy()[0]
            decoded = self._greedy_decoder(
                self.threshold,
                logits,
                self.vocab,
            )
            return decoded
        else:
            return []


    @torch.no_grad()
    def transcribe(self, frame=None): 
        unmerged = self._decode(frame) 
        return unmerged

    def reset(self):
        self.probs=np.zeros(shape=self.probs.shape, dtype=np.float32)

    def _greedy_decoder(self, threshold, logits, vocab):
        s = []
        if logits.shape[0]:
            probs = torch.softmax(torch.as_tensor(logits), dim=-1)
            for prob in probs:
                # Per-frame decision: label the frame as index 1 (speech) if that probability exceeds the threshold.
                preds = 1 if prob[1].item() > threshold else 0
                item = [preds, str(vocab[preds]), prob[0].item(), prob[1].item(), str(prob)]
                s.append(item)
        return s

def online_inference(wave_file, WINDOW_SIZE=(0.1-0.02), PROBS_NUM = 3, threshold=0.5):
    RATE = 16000
    CHUNK_SIZE = int(WINDOW_SIZE*RATE)
    vad = RTFrameVAD(model_definition = {
                   'sample_rate': SAMPLE_RATE,
                   'AudioToMFCCPreprocessor': cfg.preprocessor,
                   'JasperEncoder': cfg.encoder,
                   'labels': cfg.labels
               },
               threshold=threshold,
               window_size=WINDOW_SIZE, probs_num = PROBS_NUM)

    wf = wave.open(wave_file, 'rb')
    p = pa.PyAudio()
    empty_counter = 0
    preds = []
    proba_b = []
    proba_s = []
    # Read the file chunk by chunk and classify each chunk, including the first one.
    data = wf.readframes(CHUNK_SIZE)
    while len(data) > 0:
        signal = np.frombuffer(data, dtype=np.int16)
        result = vad.transcribe(signal)
        for item in result:
            preds.append(item[0])    # 0/1 speech decision per 20 ms frame
            proba_b.append(item[2])  # background probability
            proba_s.append(item[3])  # speech probability
        data = wf.readframes(CHUNK_SIZE)

    wf.close()
    p.terminate()
    vad.reset()

    return preds, proba_b, proba_s

Then I call online_inference() with WINDOW_SIZE=0.4 (seconds). I expected the shape of the logits returned by vad_model to be [1, 20, 2], since this model provides a sequence of speech probabilities for each 20 ms frame of the input audio and 400 ms / 20 ms = 20, but the actual shape of the logits is [1, 21, 2]. Why is there an extra item in the logits?
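For reference, here is a minimal standalone sketch of the shape check, stripped of the streaming wrapper above (my own reduction, assuming NeMo r1.20.0 and the same checkpoint; the silent dummy signal is only a placeholder):

import torch
import nemo.collections.asr as nemo_asr

model = nemo_asr.models.EncDecFrameClassificationModel.from_pretrained('vad_multilingual_frame_marblenet')
model.eval()

with torch.no_grad():
    n = int(0.4 * 16000)                           # 0.4 s at 16 kHz = 6400 samples
    audio = torch.zeros(1, n)                      # dummy silent chunk, batch size 1
    length = torch.tensor([n], dtype=torch.int64)
    logits = model.forward(input_signal=audio, input_signal_length=length)
    print(logits.shape)                            # expected [1, 20, 2]; my streaming code above gives [1, 21, 2]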
Another strange thing: when I change WINDOW_SIZE from 0.4 to 1.0, for example, I expected the first 20 items of the logits to be the same as when WINDOW_SIZE=0.4, but there are no identical items between the two outputs. Does this mean that the number of audio frames in each batch affects the model's VAD result for each frame?
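Continuing the sketch above (same model and imports), this is the kind of comparison I am making, with a random dummy signal standing in for my real audio:

torch.manual_seed(0)
audio_1s = 0.1 * torch.randn(1, 16000)             # 1.0 s of dummy audio at 16 kHz
audio_04s = audio_1s[:, :6400]                     # the same leading 0.4 s on its own

with torch.no_grad():
    logits_04s = model.forward(input_signal=audio_04s,
                               input_signal_length=torch.tensor([6400], dtype=torch.int64))
    logits_1s = model.forward(input_signal=audio_1s,
                              input_signal_length=torch.tensor([16000], dtype=torch.int64))

# Compare the first 20 frames of both outputs; I expected these to be (nearly) identical.
n = min(20, logits_04s.shape[1], logits_1s.shape[1])
print(torch.allclose(logits_04s[:, :n], logits_1s[:, :n], atol=1e-4))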
Could you help check the questions above?
Thank you very much!

Hi @174362510

I suggest posting this in the NeMo GitHub Discussions area: NVIDIA/NeMo · Discussions · GitHub

You have a good chance of getting an answer there.

Cheers,
Tom