Here is an example of how to tackle it, without worrying too much about CPU thread usage:
$ cat streams_solution.cu
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const int MAX_DEV = 8;
const size_t ds = 2ULL*1024*1024*chunks*MAX_DEV;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
  return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
__device__ double gpdf(double val, double sigma) {
  return exp(-0.5 * val * val) / (sigma * sqrt_2PI);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y, const ft mean, const ft sigma, const int n) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx < n) {
    ft in = x[idx] - (count / 2) * 0.01f;
    ft out = 0;
    for (int i = 0; i < count; i++) {
      ft temp = (in - mean) / sigma;
      out += gpdf(temp, sigma);
      in += 0.01f;
    }
    y[idx] = out / count;
  }
}
// error check macro
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), \
              __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n"); \
      exit(1); \
    } \
  } while (0)
// host-based timing
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start) {
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec * USECPSEC) + tv.tv_usec) - start;
}
int main() {
  ft *h_x, **d_x, *h_y, *h_y1, **d_y;
  cudaHostAlloc(&h_x,  ds * sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y,  ds * sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y1, ds * sizeof(ft), cudaHostAllocDefault);
  cudaCheckErrors("allocation error");
  int num_dev;
  cudaGetDeviceCount(&num_dev);
  cudaCheckErrors("device count error");
  if (((num_dev != 1) && (num_dev != 2) && (num_dev != 4) && (num_dev != 8)) || (num_dev > MAX_DEV)) {
    std::cout << "invalid device count: " << num_dev << std::endl;
    return 0;
  }
  cudaStream_t streams[MAX_DEV][num_streams];
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    for (int i = 0; i < num_streams; i++)
      cudaStreamCreate(&streams[d][i]);
  }
  cudaCheckErrors("stream creation error");
  d_x = new ft *[num_dev];
  d_y = new ft *[num_dev];
  for (size_t i = 0; i < ds; i++) {
    h_x[i] = rand() / (ft)RAND_MAX;
  }
  // device 0 gets a full-size buffer so it can also run the non-streamed test;
  // the other devices only need their 1/num_dev share of the data
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    cudaMalloc(d_x + d, ((d == 0) ? ds : (ds / num_dev)) * sizeof(ft));
    cudaMalloc(d_y + d, ((d == 0) ? ds : (ds / num_dev)) * sizeof(ft));
  }
  cudaCheckErrors("device allocation error");
  // baseline: one bulk copy in, one kernel launch, one bulk copy out, all on device 0
  cudaSetDevice(0);
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds); // warm-up
  cudaDeviceSynchronize();
  unsigned long long et1 = dtime_usec(0);
  cudaMemcpy(d_x[0], h_x, ds * sizeof(ft), cudaMemcpyHostToDevice);
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds);
  cudaMemcpy(h_y1, d_y[0], ds * sizeof(ft), cudaMemcpyDeviceToHost);
  cudaCheckErrors("non-streams execution error");
  et1 = dtime_usec(et1);
  std::cout << "non-stream single-gpu elapsed time: " << et1 / (float)USECPSEC << std::endl;
#ifdef USE_STREAMS
  cudaMemset(d_y[0], 0, ds * sizeof(ft));
  cudaDeviceSynchronize();
  unsigned long long et = dtime_usec(0);
#pragma omp parallel for
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    // each device handles a contiguous 1/num_dev slice of the host data,
    // broken into `chunks` pieces of cs elements, issued depth-first
    // (copy in, kernel, copy out per chunk), round-robin across the streams
    const size_t cs = ds / (chunks * num_dev); // chunk size in elements
    for (int i = 0; i < chunks; i++) {
      cudaMemcpyAsync(d_x[d] + i * cs, h_x + i * cs + d * (ds / num_dev), cs * sizeof(ft),
                      cudaMemcpyHostToDevice, streams[d][i % num_streams]);
      gaussian_pdf<<<(cs + 255) / 256, 256, 0, streams[d][i % num_streams]>>>(
          d_x[d] + i * cs, d_y[d] + i * cs, 0.0, 1.0, cs);
      cudaMemcpyAsync(h_y + i * cs + d * (ds / num_dev), d_y[d] + i * cs, cs * sizeof(ft),
                      cudaMemcpyDeviceToHost, streams[d][i % num_streams]);
    }
  }
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    cudaDeviceSynchronize();
  }
  cudaCheckErrors("streams execution error");
  et = dtime_usec(et);
  for (size_t i = 0; i < ds; i++) {
    if (h_y[i] != h_y1[i]) {
      std::cout << "mismatch at " << i << " was: " << h_y[i] << " should be: " << h_y1[i] << std::endl;
      return -1;
    }
  }
  std::cout << "streams elapsed time for " << num_dev << " gpus: " << et / (float)USECPSEC << std::endl;
#endif
  return 0;
}
$ nvcc -o streams_solution streams_solution.cu -DUSE_STREAMS
$ CUDA_VISIBLE_DEVICES="0,1,2,3" ./streams_solution
non-stream single-gpu elapsed time: 0.189018
streams elapsed time for 4 gpus: 0.045659
$
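Note that as compiled above, the #pragma omp parallel for is inert, since no OpenMP flag is passed to the host compiler: a single CPU thread issues the (asynchronous) work for every device in turn, which is in keeping with not worrying about CPU threads. If you wanted one CPU thread per device, a compile line along these lines (an untested sketch) should enable it:
$ nvcc -Xcompiler -fopenmp -o streams_solution streams_solution.cu -DUSE_STREAMS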
That is about a 4.1x speedup (0.189018 / 0.045659) over the non-streamed single-GPU baseline, which doesn’t quite match what I would expect from four GPUs plus copy/compute overlap, so I may still have some things to work out. The next step would be to get out the profiler. Exercise left to the reader.
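For instance, something like this (assuming Nsight Systems is installed; --stats=true prints summary tables to the console) would show how well the copies and kernels actually overlap:
$ nsys profile --stats=true ./streams_solution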
Later: system topology matters in the multi-GPU case. On the system I am working on, pairs of GPUs share a single PCIe link to the CPU. This will tend to reduce performance somewhat when transfers in the same direction are queued to two GPUs on the same link, compared to the “ideal” case of each GPU being fully independent.
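The connection matrix can be inspected with nvidia-smi (the output is system-specific, so no sample shown here):
$ nvidia-smi topo -m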