I am curious why some programs become faster when cudaStreamSynchronize is called while they are running.

When I inspect the timeline, I find that the gaps between CUDA operations are smaller when I add cudaStreamSynchronize to the program.
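(For reference, a per-operation GPU timeline with start timestamps, where such gaps are visible, can be printed with nvprof --print-gpu-trace ./app; the binary name here is just a placeholder.)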

Does anyone know why?

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory>
#include <vector>
#include <thread>
#include <chrono>

/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <nccl.h>

/* Matrix size */
#define N (256*8)
#define GPUS (2)
#define ITERATIONS (200)

float *d_A[GPUS];
float *d_B[GPUS];
float *d_C[GPUS];
int N2 = N * N;

enum NCCL_MODE {
  ASYNC = 0,
  SYNC = 1,
  ONE_STREAM = 2,
};
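// ASYNC:      GEMM and allreduce run on separate streams, chained only with
//             events; the host never blocks inside the loop.
// SYNC:       same event chaining, plus cudaStreamSynchronize on the NCCL
//             stream after every allreduce.
// ONE_STREAM: GEMM and allreduce share the cuBLAS stream; the host
//             synchronizes every iteration.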

std::unique_ptr<ncclComm_t[]> comms = nullptr;
std::unique_ptr<cudaStream_t[]> nccl_streams = nullptr;
std::unique_ptr<cudaStream_t[]> blas_streams = nullptr;

size_t timestamp() {
  using namespace std::chrono;
  return duration_cast<microseconds>(
    high_resolution_clock::now().time_since_epoch()).count();
}

void init_nccl() {
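  // One communicator and two streams (NCCL + cuBLAS) per GPU. With a single
  // process initializing every rank, the ncclCommInitRank calls must be
  // enclosed in ncclGroupStart/ncclGroupEnd, or they deadlock waiting for
  // each other.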
  comms.reset(new ncclComm_t[GPUS]);
  nccl_streams.reset(new cudaStream_t[GPUS]);
  blas_streams.reset(new cudaStream_t[GPUS]);
  ncclUniqueId nccl_id;
  ncclGetUniqueId(&nccl_id);
  ncclGroupStart();
  for (size_t i = 0; i < GPUS; ++i) {
    cudaSetDevice(i);
    cudaStreamCreate(nccl_streams.get()+i);
    ncclCommInitRank(comms.get() + i, GPUS, nccl_id, i);
    cudaStreamCreate(blas_streams.get()+i);
  }
  ncclGroupEnd();
}

int init_data(int dev) {
  float *h_a;
  float **dev_data[] = {&d_A[dev], &d_B[dev], &d_C[dev]};
  size_t memory_size = N2 * sizeof(d_A[dev][0]);
  cudaSetDevice(dev);
  for (int i = 0; i < 3; ++i) {
    h_a = reinterpret_cast<float *>(malloc(memory_size));
    /* Fill the matrix with test data */
    for (int j = 0; j < N2; j++) {
      h_a[j] = rand() / static_cast<float>(RAND_MAX);
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)(dev_data[i]), memory_size) !=
        cudaSuccess) {
      fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
      return EXIT_FAILURE;
    }
    cublasSetVector(N2, sizeof(float), h_a, 1, *(dev_data[i]), 1);
    free(h_a);
  }
  return 0;
}

int destroy_data(int dev) {
  float *d_data[] = {d_A[dev], d_B[dev], d_C[dev]};
  for (int i = 0; i < 3; ++i) {
    if (cudaFree(d_data[i]) != cudaSuccess) {
      fprintf(stderr, "!!!! memory free error\n");
      return EXIT_FAILURE;
    }
  }
  return 0;
}

/* Per-GPU worker */
int worker(int dev, int nccl_mode) {
  float alpha = 1.0f;
  float beta = 0.0f;
  cublasStatus_t status;
  cublasHandle_t handle;
  auto &blas_stream = *(blas_streams.get() + dev);
  cudaSetDevice(dev);

  status = cublasCreate(&handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }
  cublasSetStream(handle, blas_stream);

  /* Performs operation using cublas */
  auto &nccl_stream = *(nccl_streams.get() + dev);
  std::vector<cudaEvent_t> nccl_events;
  nccl_events.reserve(ITERATIONS);
  size_t start = timestamp();
  if (nccl_mode == NCCL_MODE::ONE_STREAM) {
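    // GEMM and allreduce are enqueued on the same stream, so the stream
    // itself orders them; the host additionally blocks every iteration.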
    for (size_t i = 0; i < ITERATIONS; ++i) {
      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);
      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), blas_stream);
      cudaStreamSynchronize(blas_stream);
    }
    cudaStreamSynchronize(blas_stream);
  } else {
    // nccl_mode is ASYNC or SYNC
    for (size_t i = 0; i < ITERATIONS; ++i) {
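      // The next GEMM overwrites d_C, so it must wait until the previous
      // allreduce has finished reading it.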
      if (i > 0) {
        cudaStreamWaitEvent(blas_stream, nccl_events.back(), 0);
      }
      cudaEvent_t event;
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);

      // Record the GEMM's completion on the BLAS stream and make the NCCL
      // stream wait for it, so the allreduce sees a fully computed d_C.
      cudaEventRecord(event, blas_stream);
      cudaStreamWaitEvent(nccl_stream, event, 0);
      // The event may be destroyed once recorded and waited on; the driver
      // releases its resources only after the pending work completes.
      cudaEventDestroy(event);

      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), nccl_stream);
      nccl_events.emplace_back();
      cudaEventCreateWithFlags(&nccl_events.back(), cudaEventDisableTiming);
      cudaEventRecord(nccl_events.back(), nccl_stream);
      if (nccl_mode == NCCL_MODE::SYNC) {
        cudaStreamSynchronize(nccl_stream);
      }
    }
    cudaStreamSynchronize(nccl_stream);
    // Clean up the per-iteration events now that all work has completed.
    for (auto &e : nccl_events) {
      cudaEventDestroy(e);
    }
  }
  fprintf(stderr, "device: [%d], %d iterations spent: [%f ms]\n", dev, ITERATIONS, (timestamp()-start)/1000.0);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }

  /* Shutdown */
  status = cublasDestroy(handle);

  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }
  return 0;
}

int main(int argc, char** argv) {
  init_nccl();
  for (int i = 0; i < GPUS; ++i) {
    init_data(i);
  }
  std::vector<std::thread> threads;
  size_t start = timestamp();

  int nccl_mode = NCCL_MODE::SYNC;
  // int nccl_mode = NCCL_MODE::ASYNC;
  // int nccl_mode = NCCL_MODE::ONE_STREAM;
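  // Launch one worker thread per GPU so both devices are driven concurrently.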
  for (int i = 0; i < GPUS; ++i) {
    threads.emplace_back(worker, i, nccl_mode);
  }
  for (auto &t : threads) {
    t.join();
  }

  fprintf(stderr, "nccl mode: [%d], spent: [%F ms]\n", nccl_mode, (timestamp() - start)/1000.0);

  for (int i = 0; i < GPUS; ++i) {
    destroy_data(i);
  }
  return 0;
}

Another question: how can I display a picture in this thread?

Tell us something about the environment you are running this in: Windows, Linux, or macOS? If Windows, WDDM or TCC driver?

I’ll venture a guess: you are running on Windows with the driver in WDDM mode. With WDDM, the driver batches work submission to cut per-launch overhead, so an explicit synchronize flushes the queued batch and can shrink the gaps between operations.

Regarding the second question: there is an image button in the icon bar above the text box where you enter your posts; it produces an <img> tag.

Thanks for your reply. The environment is Linux, and the phenomenon described above can be reproduced on a P40 (CUDA 8.0, NCCL 2.1) and on a V100 (CUDA 9.0, NCCL 2.3).

When you run the above code with nccl_mode set to NCCL_MODE::SYNC, the output is:

device: [0], 200 iterations spent: [312.506000 ms]
device: [1], 200 iterations spent: [312.282000 ms]
nccl mode: [1], spent: [319.407000 ms]

When nccl_mode is NCCL_MODE::ASYNC, the output is:

device: [1], 200 iterations spent: [349.681000 ms]
device: [0], 200 iterations spent: [349.931000 ms]
nccl mode: [0], spent: [367.414000 ms]

NCCL_MODE::SYNC is clearly faster, and I have reproduced this result many times.

ENV:

System: Ubuntu 16.04
CUDA: 9.0
GPU: P40
NCCL: 2.1.2