I am using nsys profile to measure the performance of my program. My program can run on 2 nodes, with 2 A100 GPUs each. However, it is stuck at the Collecting Data stage. I have also tried running some simple programs and programs that only run on a single node with nsys profile, and they were able to pass the Collecting Data stage.
The command used to run it is "sh run.sh". The contents of run.sh are:
#!/bin/bash
# Launch configuration for a 2-node x 2-GPU DeepSpeed/Megatron pretraining run.
set -euo pipefail

# Rank of this node within the job (0 = master node).
NODE_RANK=0
NNODES=2
GPUS_PER_NODE=2
# Rendezvous port for torch.distributed. The original value (22) is the SSH
# port and collides with sshd on the master node; 29500 is the
# torch.distributed default and is normally free.
MASTER_PORT=29500
MASTER_ADDR=10.x.x.19
# ib ip = 10.4.9.19
# gpu6 ip = 10.254.46.19
# NOTE(review): DISTRIBUTED_ARGS is never passed to the deepspeed launcher
# below (it uses --hostfile/--num_nodes/--num_gpus instead) — confirm whether
# this variable is still needed.
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Timestamp used to keep each run's TensorBoard logs separate;
# $(...) replaces the legacy backtick command substitution.
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
TENSORBOARD_PATH=./tensorboard/$DATETIME
config_json="deepspeed.json"
VOCAB_FILE=vocab.txt

# Dataset shards (preprocessed Megatron "document context" binaries).
# NOTE(review): DATASET_12 through DATASET_18 are defined but never used in
# the mixture below — confirm whether they were meant to be included.
# DATASET_1="/nfs/data001/001.txt_document_context"
DATASET_2="xxx/002.txt_document_context"
DATASET_3="xxx/003.txt_document_context"
DATASET_4="xxx/004.txt_document_context"
DATASET_5="xxx/005.txt_document_context"
DATASET_6="xxx/006.txt_document_context"
DATASET_7="xxx/007.txt_document_context"
DATASET_8="xxx/008.txt_document_context"
DATASET_9="xxx/009.txt_document_context"
DATASET_10="xxx/010.txt_document_context"
DATASET_11="xxx/011.txt_document_context"
DATASET_12="xxx/012.txt_document_context"
DATASET_13="xxx/013.txt_document_context"
DATASET_14="xxx/014.txt_document_context"
DATASET_15="xxx/015.txt_document_context"
DATASET_16="xxx/016.txt_document_context"
DATASET_17="xxx/017.txt_document_context"
DATASET_18="xxx/018.txt_document_context"

# Weighted data mixture passed to --data-path: every shard gets weight 0.1.
# NOTE(review): the list begins with DATASET_11 rather than DATASET_1 (which
# is commented out above) — confirm this substitution is intentional.
DATA_PATH=""
for shard in "$DATASET_11" "$DATASET_2" "$DATASET_3" "$DATASET_4" \
             "$DATASET_5" "$DATASET_6" "$DATASET_7" "$DATASET_8" \
             "$DATASET_9" "$DATASET_10"; do
  DATA_PATH="${DATA_PATH} 0.1 ${shard}"
done
# Megatron-style model/training arguments. Built as an array for readability,
# then joined into the flat string the launcher invocation below expands.
# $VOCAB_FILE, $DATA_PATH and $TENSORBOARD_PATH are defined earlier in the
# script; they are deliberately left unquoted so the final command sees the
# same word-split argument list as before.
model_args=(
  --distributed-backend nccl
  --tokenizer-type EncDecTokenizer
  --optimizer lamb
  --lr-decay-style cosine
  --vocab-file $VOCAB_FILE
  --tensor-model-parallel-size 1
  --num-layers 40
  --train-samples 10024
  --hidden-size 3072
  --num-attention-heads 24
  --seq-length 2048
  --max-position-embeddings 2048
  --micro-batch-size 1
  --global-batch-size 8
  --lr-warmup-samples 1024
  --lr 5e-3
  --min-lr 1e-05
  --weight-decay 0.0005
  --adam-beta1 0.9
  --adam-beta2 0.95
  --log-interval 1
  --eval-iters -1
  --data-path $DATA_PATH
  --save-interval 2000
  --split 100,0,0
  --init-method-std 0.002
  --fp16
  --DDP-impl local
  --checkpoint-num-layers 1
  --log-num-zeros-in-grad
  --log-params-norm
  --tensorboard-dir $TENSORBOARD_PATH
  --tensorboard-log-interval 1
  --num-workers 8
  --pipeline-model-parallel-size 4
)
gpt_options="${model_args[*]}"
# Flags consumed by the DeepSpeed runtime: ZeRO stage 1 plus activation
# checkpointing, with the rest of the config read from ${config_json}.
ds_args=(
  --deepspeed
  --deepspeed_config ${config_json}
  --zero-stage 1
  --deepspeed-activation-checkpointing
)
deepspeed_options=" ${ds_args[*]}"
# Final argument string: Megatron options followed by DeepSpeed options.
full_options="${gpt_options} ${deepspeed_options}"
# Launch the trainer under Nsight Systems.
# NOTE(review): nsys wraps the *launcher* process here; with --hostfile the
# second node's ranks are started remotely (not as children of this nsys
# session), which presumably relates to the reported multi-node hang at
# "Collecting Data" — confirm against the Nsight Systems multi-node docs.
nsys profile -w true deepspeed --hostfile="xxx/hostfile" --num_nodes ${NNODES} --num_gpus ${GPUS_PER_NODE} ./pretrain_gpt.py ${full_options}