I am curious why some programs become faster when cudaStreamSynchronize is called while the program is running.
When I print the timeline, I found that the gap between CUDA operations is smaller when I add cudaStreamSynchronize
to the program.
Does anyone know why?
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <chrono>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <nccl.h>
/* Matrix size */
#define N (256*8)
#define GPUS (2)
#define ITERATIONS (200)
// Per-GPU device matrices: each worker computes C = A * B with cuBLAS and
// then all-reduces C across GPUs with NCCL (see worker()).
float *d_A[GPUS];
float *d_B[GPUS];
float *d_C[GPUS];
// Element count of one N x N matrix.
int N2 = N * N;
// Controls how the SGEMM and the NCCL all-reduce are ordered in worker():
//   ASYNC      - separate streams chained via events, no host sync per iter
//   SYNC       - same as ASYNC plus a host sync of the NCCL stream each iter
//   ONE_STREAM - both operations issued on the cuBLAS stream
enum NCCL_MODE {
ASYNC = 0,
SYNC = 1,
ONE_STREAM = 2,
};
// One NCCL communicator plus two streams (one for NCCL, one for cuBLAS)
// per GPU; all allocated and initialized in init_nccl().
std::unique_ptr<ncclComm_t[]> comms = nullptr;
std::unique_ptr<cudaStream_t[]> nccl_streams = nullptr;
std::unique_ptr<cudaStream_t[]> blas_streams = nullptr;
// Current wall-clock time as microseconds since the clock's epoch.
size_t timestamp() {
  auto now = std::chrono::high_resolution_clock::now();
  auto micros =
      std::chrono::duration_cast<std::chrono::microseconds>(now.time_since_epoch());
  return micros.count();
}
void init_nccl() {
comms.reset(new ncclComm_t[GPUS]);
nccl_streams.reset(new cudaStream_t[GPUS]);
blas_streams.reset(new cudaStream_t[GPUS]);
ncclUniqueId nccl_id;
ncclGetUniqueId(&nccl_id);
ncclGroupStart();
for (size_t i = 0; i < GPUS; ++i) {
cudaSetDevice(i);
cudaStreamCreate(nccl_streams.get()+i);
ncclCommInitRank(comms.get() + i, GPUS, nccl_id, i);
cudaStreamCreate(blas_streams.get()+i);
}
ncclGroupEnd();
}
int init_data(int dev) {
float *h_a;
float** dev_data [] = {&d_A[dev],&d_B[dev],&d_C[dev]};
size_t memory_size = N2 * sizeof(d_A[dev][0]);
cudaSetDevice(dev);
for (int i=0;i < 3; ++i){
h_a = reinterpret_cast<float *>(malloc(memory_size));
/* Fill the matrices with test data */
for (int j = 0; j < N2; j++) {
h_a[j] = rand() / static_cast<float>(RAND_MAX);;
}
/* Allocate device memory for the matrices */
if (cudaMalloc((void **)(dev_data[i]), memory_size) !=
cudaSuccess) {
fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
return EXIT_FAILURE;
}
cublasSetVector(N2, sizeof(float), h_a, 1, *(dev_data[i]), 1);
}
return 0;
}
int destroy_data(int dev) {
float* d_data[] = {d_A[dev], d_B[dev] ,d_C[dev] };
for (int i =0 ;i < 3; ++i) {
if (cudaFree(d_data[i]) != cudaSuccess) {
fprintf(stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
}
return 0;
}
/* Per-GPU benchmark thread body (one worker thread per device) */
/*
 * Runs ITERATIONS rounds of SGEMM (cuBLAS) followed by an in-place NCCL
 * all-reduce of d_C on device `dev`, and prints the elapsed time.
 *
 * nccl_mode selects the ordering of the two operations:
 *   ONE_STREAM - both ops issued on the cuBLAS stream, host-synced each iter.
 *   ASYNC/SYNC - SGEMM on the cuBLAS stream, all-reduce on the NCCL stream,
 *                chained via events; SYNC additionally host-syncs the NCCL
 *                stream after every iteration.
 *
 * Returns 0 on success, EXIT_FAILURE on a cuBLAS error.
 * Fixes vs. original: every cudaEvent_t is now destroyed (the original
 * leaked 2 * ITERATIONS events per GPU).
 */
int worker(int dev, int nccl_mode) {
  float alpha = 1.0f;
  float beta = 0.0f;
  cublasStatus_t status;
  cublasHandle_t handle;
  auto &blas_stream = *(blas_streams.get() + dev);
  cudaSetDevice(dev);
  status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
  }
  cublasSetStream(handle, blas_stream);
  auto &nccl_stream = *(nccl_streams.get() + dev);
  std::vector<cudaEvent_t> nccl_events;
  nccl_events.reserve(ITERATIONS);
  size_t start = timestamp();
  if (nccl_mode == NCCL_MODE::ONE_STREAM) {
    /* Single stream: SGEMM and all-reduce fully serialized, with a host
       sync every iteration. */
    for (size_t i = 0; i < ITERATIONS; ++i) {
      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);
      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), blas_stream);
      cudaStreamSynchronize(blas_stream);
    }
    cudaStreamSynchronize(blas_stream);
  } else {
    // nccl_mode is ASYNC or SYNC: two streams chained with events.
    for (size_t i = 0; i < ITERATIONS; ++i) {
      if (i > 0) {
        // The next SGEMM must not start before the previous all-reduce
        // has finished updating d_C.
        cudaStreamWaitEvent(blas_stream, nccl_events.back(), 0);
      }
      cudaEvent_t event;
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
      status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A[dev],
                           N, d_B[dev], N, &beta, d_C[dev], N);
      cudaEventRecord(event, blas_stream);
      cudaStreamWaitEvent(nccl_stream, event, 0);
      // The wait above has captured the event's state; destroying it here is
      // safe because cudaEventDestroy defers cleanup until the event
      // completes.  (The original never destroyed it.)
      cudaEventDestroy(event);
      ncclAllReduce(d_C[dev], d_C[dev], N2, ncclFloat, ncclSum, *(comms.get() + dev), nccl_stream);
      nccl_events.emplace_back();
      cudaEventCreateWithFlags(&nccl_events.back(), cudaEventDisableTiming);
      cudaEventRecord(nccl_events.back(), nccl_stream);
      if (nccl_mode == SYNC) {
        cudaStreamSynchronize(nccl_stream);
      }
    }
    cudaStreamSynchronize(nccl_stream);
    // All work has completed, so the cross-iteration events can be released.
    for (auto &e : nccl_events) {
      cudaEventDestroy(e);
    }
  }
  fprintf(stderr, "device: [%d], %d iterations spent: [%f ms]\n", dev, ITERATIONS, (timestamp()-start)/1000.0);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
  }
  /* Shutdown */
  status = cublasDestroy(handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
  }
  return 0;
}
/*
 * Entry point: initializes NCCL and the test matrices, launches one worker
 * thread per GPU with the selected NCCL mode, waits for them, prints the
 * total elapsed time, and frees the device matrices.
 */
int main(int argc, char** argv) {
  init_nccl();
  for (int dev = 0; dev < GPUS; ++dev) {
    init_data(dev);
  }
  // Pick the benchmark variant (see NCCL_MODE for the alternatives).
  int nccl_mode = NCCL_MODE::SYNC;
  // int nccl_mode = NCCL_MODE::ASYNC;
  // int nccl_mode = NCCL_MODE::ONE_STREAM;
  std::vector<std::thread> threads;
  size_t start = timestamp();
  for (int dev = 0; dev < GPUS; ++dev) {
    threads.emplace_back(worker, dev, nccl_mode);
  }
  for (auto &t : threads) {
    t.join();
  }
  fprintf(stderr, "nccl mode: [%d], spent: [%F ms]\n", nccl_mode, (timestamp() - start)/1000.0);
  for (int dev = 0; dev < GPUS; ++dev) {
    destroy_data(dev);
  }
  return 0;
}
Another question: how can I display a picture in this thread?