I am trying to run Parabricks DeepVariant on some BAMs aligned to the UCSC hg19 reference. The BAMs were aligned to the soft-masked reference. While troubleshooting, I made a BED file listing all the non-masked regions of the hard-masked reference and passed it with --interval-file,
but I still hit the same problem. When the software reaches chr20 at around 60 megabases, I get the error:
[src/PBBgzfFile.cpp:893] Data already present in output., expected absent == 1, exiting.
Here is my bash script, for reference:
#!/bin/bash
# Slurm batch job that runs Parabricks DeepVariant (pbrun) on a single sample.
# The "#SBATCH" lines below are job options read by sbatch; bash itself treats
# them as comments. Keep them ahead of the first executable command.
#
# Account and partition the job is submitted under.
#SBATCH --account=gpu-sids-sponsored
#SBATCH --partition=gpu-core-sponsored
# One task on one node, with 24 CPU cores for that task.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=24
# Memory is requested per CPU, so the job total is 24 x 7500M.
#SBATCH --mem-per-cpu=7500M
# A single GPU; the pbrun call further down is given --num-gpus 1 to match.
#SBATCH --gpus=1
# Wall-clock limit in days-hours:minutes:seconds form (here: 3 hours).
#SBATCH --time=0-03:00:00
#SBATCH --job-name=parabricks
# E-mail notifications for all job events, sent to the address below.
#SBATCH --mail-type=ALL
#SBATCH --mail-user=Lindsay.Clark@seattlechildrens.org
# Working directory for the job.
# NOTE(review): no --output/--error is set, so the Slurm log file presumably
# ends up in this directory -- confirm.
#SBATCH --chdir=/data/hps/assoc/private/sids/user/lclar5/logs
# Run Parabricks DeepVariant in gVCF mode for one sample listed in a CSV
# sample sheet. The sample is chosen by ROW (1-based, header line excluded);
# column 3 of that row is the sample ID and column 4 is the BAM path.

# Fail fast: abort on any command error, unset variable, or failed pipeline
# stage, so a failed pbrun is never followed by the cleanup step.
set -euo pipefail

# --- Fixed inputs / outputs --------------------------------------------------
readonly CONTAINER=/data/hps/assoc/public/bioinformatics/container/rare-disease-wf/nvcr.io-nvidia-clara-clara-parabricks-4.4.0-1.img
readonly INCSV=/data/hps/assoc/private/sids/user/lclar5/ramirez_2025-06_rare-disease-wf/samples/SIDS_bams_2025-06-25_test10.csv
readonly OUTDIR=/data/hps/assoc/private/sids/user/lclar5/ramirez_2025-06_rare-disease-wf/results/gvcfs
readonly FASTA=/data/hps/assoc/public/bioinformatics/annotations/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/hg19.fa
readonly INTERVALS=/data/hps/assoc/private/sids/user/lclar5/ramirez_2025-06_rare-disease-wf/assets/hg19_chrom_nonmasked.bed
readonly TMP=/data/hps/assoc/private/sids/user/lclar5/temp_parabricks

# --- Sample selection --------------------------------------------------------
# Data row to process; row 1 is the first line after the CSV header.
ROW=1

if [[ ! -r "$INCSV" ]]; then
  printf 'error: cannot read sample sheet: %s\n' "$INCSV" >&2
  exit 1
fi

# Pull exactly line ROW+1 (skipping the header). Unlike `head | tail`, this
# yields nothing when ROW is past the end of the file instead of silently
# returning the last sample.
csv_line=$(sed -n "$((ROW + 1))p" "$INCSV")
# Drop a trailing carriage return in case the sheet has CRLF line endings.
csv_line=${csv_line%$'\r'}

# Split on commas; fields 1-2 and anything after field 4 are discarded.
# Quoted CSV fields containing commas are not supported.
IFS=, read -r _ _ SAMPLE_ID BAM _ <<<"$csv_line"

if [[ -z "$SAMPLE_ID" || -z "$BAM" ]]; then
  printf 'error: row %s of %s has no sample ID (col 3) or BAM (col 4)\n' \
    "$ROW" "$INCSV" >&2
  exit 1
fi

printf '%s\n' "$SAMPLE_ID"
printf '%s\n' "$BAM"

# --- Run ---------------------------------------------------------------------
mkdir -p -- "$OUTDIR" "$TMP/$SAMPLE_ID"

# --bind exposes /data/hps/assoc inside the container at the same path;
# --nv enables NVIDIA GPU support in the container.
apptainer exec --bind /data/hps/assoc --nv \
  "$CONTAINER" \
  pbrun deepvariant \
  --ref "$FASTA" \
  --in-bam "$BAM" \
  --out-variants "$OUTDIR/$SAMPLE_ID.g.vcf.gz" \
  --interval-file "$INTERVALS" \
  --num-gpus 1 \
  --gvcf \
  --tmp-dir "$TMP/$SAMPLE_ID"

# Remove the uncompressed .vcf next to the gVCF.
# NOTE(review): presumably an intermediate file left behind by pbrun -- confirm.
# -f keeps a successful run from exiting non-zero when the file is absent.
rm -f -- "$OUTDIR/$SAMPLE_ID.vcf"