I am curious why some programs become faster when cudaStreamSynchronize is called while they are running.

When I inspect the timeline, I find that the gaps between CUDA operations are smaller when I add cudaStreamSynchronize to the program.
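(For reference, a per-operation GPU timeline with start timestamps, where such gaps are visible, can be printed with nvprof --print-gpu-trace ./app; the binary name here is just a placeholder.)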

Does anyone know why?

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory>
#include <vector>
#include <thread>
#include <chrono>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <nccl.h>

/* Matrix size */
#define N (256*8)
#define GPUS (2)
#define ITERATIONS (200)

float *d_A[GPUS];
float *d_B[GPUS];
float *d_C[GPUS];
int N2 = N * N;

enum NCCL_MODE {
  ASYNC = 0,
  SYNC = 1,
  ONE_STREAM = 2,
};
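// ASYNC:      GEMM and allreduce run on separate streams, chained only with
//             events; the host never blocks inside the loop.
// SYNC:       same event chaining, plus cudaStreamSynchronize on the NCCL
//             stream after every allreduce.
// ONE_STREAM: GEMM and allreduce share the cuBLAS stream; the host
//             synchronizes every iteration.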

std::unique_ptr<ncclComm_t[]> comms = nullptr;
std::unique_ptr<cudaStream_t[]> nccl_streams = nullptr;
std::unique_ptr<cudaStream_t[]> blas_streams = nullptr;

size_t timestamp() {
  using namespace std::chrono;
  return duration_cast<microseconds>(
    high_resolution_clock::now().time_since_epoch()).count();
}

void init_nccl() {
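  // One communicator and two streams (NCCL + cuBLAS) per GPU. With a single
  // process initializing every rank, the ncclCommInitRank calls must be
  // enclosed in ncclGroupStart/ncclGroupEnd, or they deadlock waiting for
  // each other.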
  comms.reset(new ncclComm_t[GPUS]);
  nccl_streams.reset(new cudaStream_t[GPUS]);
  blas_streams.reset(new cudaStream_t[GPUS]);
  ncclUniqueId nccl_id;
  ncclGetUniqueId(&nccl_id);
  ncclGroupStart();
  for (size_t i = 0; i < GPUS; ++i) {
    cudaSetDevice(i);
    cudaStreamCreate(nccl_streams.get()+i);
    ncclCommInitRank(comms.get() + i, GPUS, nccl_id, i);
    cudaStreamCreate(blas_streams.get()+i);
  }
  ncclGroupEnd();
}

int init_data(int dev) {
  float *h_a;
  float **dev_data[] = {&d_A[dev], &d_B[dev], &d_C[dev]};
  size_t memory_size = N2 * sizeof(d_A[dev][0]);
  cudaSetDevice(dev);
  for (int i = 0; i < 3; ++i) {
    h_a = reinterpret_cast<float *>(malloc(memory_size));
    /* Fill the matrix with test data */
    for (int j = 0; j < N2; j++) {
      h_a[j] = rand() / static_cast<float>(RAND_MAX);
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)(dev_data[i]), memory_size) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
      return EXIT_FAILURE;
    }
    cublasSetVector(N2, sizeof(float), h_a, 1, *(dev_data[i]), 1);
    free(h_a);
  }
  return 0;
}

int destroy_data(int dev) {
  float *d_data[] = {d_A[dev], d_B[dev], d_C[dev]};
  for (int i = 0; i < 3; ++i) {
    if (cudaFree(d_data[i]) != cudaSuccess) {
      fprintf(stderr, "!!!! memory free error\n");
      return EXIT_FAILURE;
    }
  }
  return 0;
}

/* Per-GPU worker */
int worker(int dev, int nccl_mode) {
  float alpha = 1.0f;
  float beta = 0.0f;
  cublasStatus_t status;
  cublasHandle_t handle;
  auto &blas_stream = *(blas_streams.get() + dev);
  cudaSetDevice(dev);

  status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }
  cublasSetStream(handle, blas_stream);

  /* Performs operation using cublas */
  auto &nccl_stream = *(nccl_streams.get() + dev);
  std::vector<cudaEvent_t> nccl_events;
  nccl_events.reserve(ITERATIONS);
  size_t start = timestamp();
  if (nccl_mode == NCCL_MODE::ONE_STREAM) {
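    // GEMM and allreduce are enqueued on the same stream, so the stream
    // itself orders them; the host additionally blocks every iteration.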
    for (size_t i = 0; i < ITERATIONS; ++i) {
      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);
      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), blas_stream);
      cudaStreamSynchronize(blas_stream);
    }
    cudaStreamSynchronize(blas_stream);
  } else {
    // nccl_mode is ASYNC or SYNC
    for (size_t i = 0; i < ITERATIONS; ++i) {
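      // The next GEMM overwrites d_C, so it must wait until the previous
      // allreduce has finished reading it.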
      if (i > 0) {
        cudaStreamWaitEvent(blas_stream, nccl_events.back(), 0);
      }
      cudaEvent_t event;
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);

      // Record the GEMM's completion on the BLAS stream and make the NCCL
      // stream wait for it, so the allreduce sees a fully computed d_C.
      cudaEventRecord(event, blas_stream);
      cudaStreamWaitEvent(nccl_stream, event, 0);
      // The event may be destroyed once recorded and waited on; the driver
      // releases its resources only after the pending work completes.
      cudaEventDestroy(event);

      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), nccl_stream);
      nccl_events.emplace_back();
      cudaEventCreateWithFlags(&nccl_events.back(), cudaEventDisableTiming);
      cudaEventRecord(nccl_events.back(), nccl_stream);
      if (nccl_mode == NCCL_MODE::SYNC) {
        cudaStreamSynchronize(nccl_stream);
      }
    }
    cudaStreamSynchronize(nccl_stream);
    // Clean up the per-iteration events now that all work has completed.
    for (auto &e : nccl_events) {
      cudaEventDestroy(e);
    }
  }
  fprintf(stderr, "device: [%d], %d iterations spent: [%f ms]\n", dev, ITERATIONS, (timestamp()-start)/1000.0);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  /* Shutdown */
  status = cublasDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }
  return 0;
}

int main(int argc, char** argv) {
  init_nccl();
  for (int i = 0; i < GPUS; ++i) {
    init_data(i);
  }
  std::vector<std::thread> threads;
  size_t start = timestamp();

  int nccl_mode = NCCL_MODE::SYNC;
  // int nccl_mode = NCCL_MODE::ASYNC;
  // int nccl_mode = NCCL_MODE::ONE_STREAM;
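  // Launch one worker thread per GPU so both devices are driven concurrently.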
  for (int i = 0; i < GPUS; ++i) {
    threads.emplace_back(worker, i, nccl_mode);
  }
  for (auto &t : threads) {
    t.join();
  }

  fprintf(stderr, "nccl mode: [%d], spent: [%F ms]\n", nccl_mode, (timestamp() - start)/1000.0);

  for (int i = 0; i < GPUS; ++i) {
    destroy_data(i);
  }
  return 0;
}

Another question: how can I display a picture in this thread?

Tell us something about the environment you are running this in: Windows, Linux, or macOS? If Windows, WDDM or TCC driver?

I’ll venture a guess: you are running on Windows with the driver in WDDM mode. With WDDM, the driver batches work submission to cut per-launch overhead, so an explicit synchronize flushes the queued batch and can shrink the gaps between operations.

Regarding the second question: there is an image button in the icon bar above the text box where you enter your posts; it produces an <img> tag.

Thanks for your reply. The environment is Linux, and the phenomenon described above can be reproduced on a P40 (CUDA 8.0, NCCL 2.1) and on a V100 (CUDA 9.0, NCCL 2.3).

When you run the above code with nccl_mode set to NCCL_MODE::SYNC, the output is:

device: [0], 200 iterations spent: [312.506000 ms]
device: [1], 200 iterations spent: [312.282000 ms]
nccl mode: [1], spent: [319.407000 ms]

When nccl_mode is NCCL_MODE::ASYNC, the output is:

device: [1], 200 iterations spent: [349.681000 ms]
device: [0], 200 iterations spent: [349.931000 ms]
nccl mode: [0], spent: [367.414000 ms]

NCCL_MODE::SYNC is clearly faster, and I have reproduced this result many times.

ENV:

System: Ubuntu 16.04
CUDA: 9.0
GPU: P40
NCCL: 2.1.2