# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# TLT spec file for training the Conformer-CTC model for ASR using the AN4 dataset.

trainer:
  # gpus: -1 # number of GPUs; -1 uses all available GPUs
  # num_nodes: 1
  max_epochs: 1000
  # max_steps: null # computed at runtime if not set
  # val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or an int for a number of iterations
  # accelerator: ddp
  # accumulate_grad_batches: 1
  # gradient_clip_val: 0.0
  # precision: 32 # set to 16 to enable AMP (O1/O2)
  # log_every_n_steps: 10 # logging interval
  # progress_bar_refresh_rate: 10
  # resume_from_checkpoint: null # path to a checkpoint file to resume training from; restores the full state (epoch, step, LR schedulers, apex, etc.)
  # num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; 0 disables it
  # check_val_every_n_epoch: 1 # run validation every n epochs
  # sync_batchnorm: true
  # checkpoint_callback: true # provided by exp_manager
  # logger: false # provided by exp_manager

tlt_checkpoint_interval: 1

#save_to: trained-model.tlt

model:
  log_prediction: true # log sample predictions during training
  ctc_reduction: 'mean_batch'

  # a small vocab size of 128 or 256 is recommended when using 4x sub-sampling
  # see /scripts/tokenizers/process_asr_text_tokenizer.py for details on how to train a tokenizer
  tokenizer:
    dir: /data/ # path to a directory containing either tokenizer.model (bpe) or vocab.txt (wpe)
    type: "bpe" # can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: 16000
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    # smaller models may converge faster with fewer time_masks
    time_masks: 5 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: 80
    feat_out: -1 # set this if you need an output size other than the default (d_model)
    n_layers: 16
    d_model: 176

    # Sub-sampling params
    subsampling: striding # vggnet or striding; vggnet may give better results but needs more memory
    subsampling_factor: 4 # must be a power of 2
    subsampling_conv_channels: -1 # -1 sets it to d_model

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed attention module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 4 # may need to be lower for smaller d_model
    # [left, right] specifies the number of steps seen to the left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the Transformer-XL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm

    ### regularization
    dropout: 0.1 # dropout used in most of the Conformer modules
    dropout_emb: 0.0 # dropout used for embeddings
    dropout_att: 0.1 # dropout for the multi-headed attention modules
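  # Note: for BPE-based CTC models the decoder's feat_in, num_classes, and vocabulary are
  # normally derived at runtime from the encoder output size and the tokenizer, so the
  # placeholder values below (null / -1 / []) can usually be left unchanged.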
  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: null
    num_classes: -1
    vocabulary: []

  training_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7 # tuned for LibriSpeech; you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_end_token: false

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # weight_decay is less necessary here since SpecAugment already provides strong augmentation
    # you may need weight_decay for large models, stable AMP training, small datasets, or when lighter augmentation is used
    # a weight decay of 0.0 with an lr of 2.0 also works fine
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6
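# Illustrative notes (comments only, not read by the training job):
#
# Manifests referenced by manifest_filepath follow the standard NeMo ASR manifest
# format: one JSON object per line with the audio path, duration in seconds, and
# transcript. Example line (the path, duration, and text are placeholders):
#   {"audio_filepath": "/data/an4/wav/sample.wav", "duration": 2.5, "text": "yes"}
#
# A tokenizer matching model.tokenizer can be built with the script referenced in the
# tokenizer section above. A typical invocation looks roughly like the following
# (flag names are assumed from NeMo and may differ by version; check the script's --help):
#   python process_asr_text_tokenizer.py \
#     --manifest=/data/train_manifest.json \
#     --data_root=/data/tokenizer \
#     --vocab_size=128 \
#     --tokenizer=spe \
#     --spe_type=bpe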