Megatron-LM distributed training error

I’m trying to use the Megatron-LM library (https://github.com/NVIDIA/Megatron-LM).
When I run the command

OMP_NUM_THREADS=10 bash scripts/pretrain_gpt2_distributed.sh

I get the following error:

initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
configuring data
Traceback (most recent call last):
  File "pretrain_gpt2.py", line 625, in <module>
    main()
  File "pretrain_gpt2.py", line 569, in main
    args.eod_token = get_train_val_test_data(args)
  File "pretrain_gpt2.py", line 515, in get_train_val_test_data
    args)
  File "/home/ubuntu/Megatron-LM/configure_data.py", line 34, in apply
    return make_loaders(args)
  File "/home/ubuntu/Megatron-LM/configure_data.py", line 170, in make_loaders
    train, tokenizer = data_utils.make_dataset(**data_set_args)
  File "/home/ubuntu/Megatron-LM/data_utils/__init__.py", line 114, in make_dataset
    ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
  File "/home/ubuntu/Megatron-LM/data_utils/__init__.py", line 114, in <listcomp>
    ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds]
  File "/home/ubuntu/Megatron-LM/data_utils/datasets.py", line 477, in __init__
    self.init_weighting()
  File "/home/ubuntu/Megatron-LM/data_utils/datasets.py", line 487, in init_weighting
    self.weighting = list(accumulate(lens))
TypeError: iteration over a 0-d array

(All 8 worker processes print this same traceback, after which the launcher fails:)
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/Env/ml/lib/python3.6/site-packages/torch/distributed/launch.py", line 246, in <module>
    main()
  File "/home/ubuntu/Env/ml/lib/python3.6/site-packages/torch/distributed/launch.py", line 242, in main
    cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/Env/ml/bin/python', '-u', 'pretrain_gpt2.py', '--local_rank=7', '--num-layers', '24', '--hidden-size', '1024', '--num-attention-heads', '16', '--batch-size', '8', '--seq-length', '1024', '--max-position-embeddings', '1024', '--train-iters', '320000', '--save', 'checkpoints/gpt2_345m', '--load', 'checkpoints/gpt2_345m', '--resume-dataloader', '--train-data', 'wikipedia', '--lazy-loader', '--tokenizer-type', 'GPT2BPETokenizer', '--cache-dir', 'cache', '--split', '949,50,1', '--distributed-backend', 'nccl', '--lr', '0.00015', '--lr-decay-style', 'cosine', '--weight-decay', '1e-2', '--clip-grad', '1.0', '--warmup', '.01', '--checkpoint-activations', '--fp16']' returned non-zero exit status 1.
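
For reference, the TypeError itself is easy to reproduce in isolation. Below is a minimal sketch, assuming `lens` in init_weighting() comes back as a 0-d NumPy array (a single scalar length rather than an array of per-document lengths); that assumption is mine, based only on the error message:

    # Hypothetical reproduction of the failure in init_weighting().
    # Assumes `lens` is a 0-d NumPy array (shape ()); not verified against
    # the actual Megatron-LM data pipeline.
    from itertools import accumulate
    import numpy as np

    lens = np.array(37)                     # 0-d array: shape (), not (1,)
    try:
        weighting = list(accumulate(lens))  # same call as in datasets.py:487
    except TypeError as e:
        print(e)                            # -> iteration over a 0-d array

    # Coercing to at least 1-d makes the call succeed:
    weighting = list(accumulate(np.atleast_1d(lens)))
    print(weighting)                        # cumulative lengths, e.g. [37]

So it looks like the lengths computed for the lazily loaded wikipedia dataset collapse to a scalar somewhere, but I don’t see why.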

Could you help me deal with this issue?