Multi stream multi GPU

Hello, I have been trying to solve a CUDA workshop question - https://github.com/olcf/cuda-training-series/blob/master/exercises/hw10/readme.md#3-bonus-task---multi-gpu

I added cudaSetDevice calls when allocating the memory, as well as inside the omp loop.
I continue to run into illegal memory access, and can’t seem to figure out the appropriate way to divide this problem across two GPUs while using multiple streams. Can anyone please provide guidance?

Did you look at the solution code?

This is the bonus problem. I have looked at the solution code. The solution code doesn’t tackle the multi-GPU scenario.

It’s unlikely that adding some cudaSetDevice() calls, only, is enough. More extensive refactoring is needed, probably.

You will need to think 2 dimensionally. Much of the activity will need to get done using doubly-nested for loops.

You will need a 2 dimensional array of created streams. One axis/dimension is the number of GPUs. The other axis/dimension is the number of streams (i.e. chunks) that will be issued to each GPU.

You will need to create allocations on all GPUs. The size is dictated by the number of chunks per GPU.

You will need a doubly-nested for loop for the work issuance. One loop nest iterates across devices, the other loop nest iterates across chunks per device.

If it were me, I would take a first crack at this without using OMP threads. The OMP threads are not essential to a well-worked solution. If you want to add OMP threads, the sensible way to do it (IMO) is to have one thread per GPU, not one thread per chunk as is indicated there. That makes the code refactoring even more difficult, if you try to use one thread per GPU, and a breadth-first launch. One thread per GPU with a depth-first launch should be easier, and is somewhat consistent with the existing solution code.

Here is an example of how to tackle it, without worrying too much about the usage of CPU threads:

$ cat streams_solution.cu
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>

// modifiable tuning parameters
typedef float ft;          // element type used throughout (change to double to test fp64)
const int chunks = 64;     // number of work chunks issued per GPU
const int MAX_DEV = 8;     // maximum number of GPUs supported
const size_t ds = 2ULL*1024*1024*chunks*MAX_DEV; // total element count; divisible by chunks*MAX_DEV by construction
const int count = 22;      // window width: samples averaged per output point
const int num_streams = 8; // streams created per GPU; chunks are round-robined onto them
// not modifiable: sqrt(2*pi) at float/double precision, used for pdf normalization
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
// Gaussian density of a pre-normalized value: exp(-val^2/2) / (sigma * sqrt(2*pi)),
// single-precision flavor (caller is expected to pass val = (x - mean) / sigma)
__device__ float gpdf(float val, float sigma) {
  const float neg_half_sq = -0.5f * val * val;  // same evaluation order as the double overload
  return expf(neg_half_sq) / (sigma * sqrt_2PIf);
}

// Gaussian density of a pre-normalized value: exp(-val^2/2) / (sigma * sqrt(2*pi)),
// double-precision overload selected automatically when ft is double
__device__ double gpdf(double val, double sigma) {
  const double neg_half_sq = -0.5 * val * val;  // same evaluation order as the float overload
  return exp(neg_half_sq) / (sigma * sqrt_2PI);
}

// compute average gaussian pdf value over a window around each point
//   x: input samples, y: output averages, n: number of elements to process
// Uses a grid-stride loop so the kernel is correct for ANY grid/block
// configuration; for the exact-cover launches used in this file each thread
// executes the loop body at most once, so results are unchanged.
// NOTE: blockDim.x * gridDim.x and the indices fit in int for the launch
// sizes used here (<= 2^30 elements) — TODO widen to size_t for larger n.
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y, const ft mean, const ft sigma, const int n) {
  for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < n; idx += blockDim.x * gridDim.x) {
    // window start: count/2 (integer division) steps of 0.01 below the sample
    ft in = x[idx] - (count / 2) * 0.01f;
    ft out = 0;
    for (int i = 0; i < count; i++) {
      ft temp = (in - mean) / sigma;  // normalize the current window sample
      out += gpdf(temp, sigma);
      in += 0.01f;
    }
    y[idx] = out / count;  // average over the window
  }
}

// error check macro: report the most recent CUDA runtime error (if any) with
// the caller-supplied context message, then abort.  Note that
// cudaGetLastError() both returns AND clears the sticky error state, and that
// asynchronous kernel-execution errors only surface at a later synchronizing
// call — so place this after a sync point to catch kernel faults.
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(__err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } \
  } while (0)

// host-based timing
#define USECPSEC 1000000ULL

// Returns wall-clock microseconds elapsed since `start`.
// Pass 0 to obtain an absolute timestamp; pass a previous return value
// to measure an interval.
unsigned long long dtime_usec(unsigned long long start) {
  timeval now;
  gettimeofday(&now, 0);
  const unsigned long long abs_usec =
      (unsigned long long)now.tv_sec * USECPSEC + now.tv_usec;
  return abs_usec - start;
}

int main() {
  ft *h_x, **d_x, *h_y, *h_y1, **d_y;
  // pinned host buffers: required for cudaMemcpyAsync to truly overlap with compute
  cudaHostAlloc(&h_x,  ds*sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y,  ds*sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y1, ds*sizeof(ft), cudaHostAllocDefault);
  cudaCheckErrors("allocation error");
  int num_dev;
  cudaGetDeviceCount(&num_dev);
  cudaCheckErrors("device count error");
  // only power-of-two device counts divide ds evenly; parentheses make the
  // intended grouping explicit (&& binds tighter than ||)
  if (((num_dev != 1) && (num_dev != 2) && (num_dev != 4) && (num_dev != 8)) || (num_dev > MAX_DEV)) {
    std::cout << "invalid device count: " << num_dev << std::endl;
    return 0;
  }
  // one set of streams per device; chunks are round-robined onto them below
  cudaStream_t streams[MAX_DEV][num_streams];
  for (int d = 0; d < num_dev; d++){
    cudaSetDevice(d);  // streams are created on the currently-selected device
    for (int i = 0; i < num_streams; i++)
      cudaStreamCreate(&streams[d][i]);
  }
  cudaCheckErrors("stream creation error");

  d_x = new ft *[num_dev];
  d_y = new ft *[num_dev];
  for (size_t i = 0; i < ds; i++) {
    h_x[i] = rand() / (ft)RAND_MAX;
  }
  // device 0 gets a full-size allocation so it can also run the single-GPU
  // baseline; every other device only needs its 1/num_dev share
  for (int d = 0; d < num_dev; d++){
    cudaSetDevice(d);
    cudaMalloc(d_x+d, ((d==0)?ds:(ds/num_dev))*sizeof(ft));
    cudaMalloc(d_y+d, ((d==0)?ds:(ds/num_dev))*sizeof(ft));
  }
  cudaCheckErrors("device allocation error");
  cudaSetDevice(0);
  // warm-up launch (input uninitialized; result discarded and overwritten below)
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds);
  cudaDeviceSynchronize();
  cudaCheckErrors("warm-up error");
  unsigned long long et1 = dtime_usec(0);

  // single-GPU, single-stream baseline
  cudaMemcpy(d_x[0], h_x, ds * sizeof(ft), cudaMemcpyHostToDevice);
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds);
  cudaMemcpy(h_y1, d_y[0], ds * sizeof(ft), cudaMemcpyDeviceToHost);
  cudaCheckErrors("non-streams execution error");

  et1 = dtime_usec(et1);
  std::cout << "non-stream single-gpu elapsed time: " << et1/(float)USECPSEC << std::endl;

#ifdef USE_STREAMS
  // clear device 0's output so stale baseline results cannot mask errors
  cudaMemset(d_y[0], 0, ds * sizeof(ft));
  cudaDeviceSynchronize();
  unsigned long long et = dtime_usec(0);

  const size_t csz = ds / (chunks*num_dev);  // chunk size in elements
  // one OMP thread per GPU; each thread issues its chunks depth-first
#pragma omp parallel for
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    for (int i = 0; i < chunks; i++) { //depth-first launch
      cudaStream_t s = streams[d][i % num_streams];
      // host offset includes this device's 1/num_dev share; device offset does not
      cudaMemcpyAsync(d_x[d] + i * csz, h_x + i * csz + d * (ds/num_dev), csz * sizeof(ft), cudaMemcpyHostToDevice, s);
      gaussian_pdf<<<(csz + 255) / 256, 256, 0, s>>>(d_x[d] + i * csz, d_y[d] + i * csz, 0.0, 1.0, csz);
      cudaMemcpyAsync(h_y + i * csz + d * (ds/num_dev), d_y[d] + i * csz, csz * sizeof(ft), cudaMemcpyDeviceToHost, s);
    }
  }
  for (int d = 0; d < num_dev; d++){
    cudaSetDevice(d);
    cudaDeviceSynchronize();  // drain all streams on this device
  }
  cudaCheckErrors("streams execution error");

  et = dtime_usec(et);

  // bitwise comparison is valid: both paths perform the identical sequence of
  // float operations per element.  size_t index because ds is size_t.
  for (size_t i = 0; i < ds; i++) {
    if (h_y[i] != h_y1[i]) {
      std::cout << "mismatch at " << i << " was: " << h_y[i] << " should be: " << h_y1[i] << std::endl;
      return -1;
    }
  }

  std::cout << "streams elapsed time for " << num_dev << " gpus: " << et/(float)USECPSEC << std::endl;
#endif

  // cleanup: release streams, device memory, and pinned host memory
  for (int d = 0; d < num_dev; d++){
    cudaSetDevice(d);
    for (int i = 0; i < num_streams; i++)
      cudaStreamDestroy(streams[d][i]);
    cudaFree(d_x[d]);
    cudaFree(d_y[d]);
  }
  delete[] d_x;
  delete[] d_y;
  cudaFreeHost(h_x);
  cudaFreeHost(h_y);
  cudaFreeHost(h_y1);
  cudaCheckErrors("cleanup error");
  return 0;
}
$ nvcc -o streams_solution streams_solution.cu -DUSE_STREAMS
$ CUDA_VISIBLE_DEVICES="0,1,2,3" ./streams_solution
non-stream single-gpu elapsed time: 0.189018
streams elapsed time for 4 gpus: 0.045659
$

This doesn’t quite give me the speedup I would expect, so I may still have some things to work out. The next step would be to get out the profiler. Exercise left to the reader.

Later: system topology matters in the multi-GPU case. The system I am working on is sharing pairs of GPUs on each PCIE channel to a CPU. Therefore this will tend to reduce performance somewhat, when transfers in the same direction are queued up to two separate GPUs, compared to the “ideal” case of each GPU being fully independent.

Thanks for the hints. I will try to tackle this before coming to the solution code you shared.

So I followed your hints, and ended up pretty close to what you have. I keep getting the error “invalid argument”. This corresponds to the line cudaCheckErrors(“streams execution error”). I get this error even though I have commented out all the cuda memcpy calls as well as the kernel call. What am I missing?

int main() {
  ft *h_x, *h_y, *d_x[num_gpu], *d_y[num_gpu];
  // pinned host buffers so cudaMemcpyAsync can overlap with compute
  cudaHostAlloc(&h_x,  ds*sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y,  ds*sizeof(ft), cudaHostAllocDefault);
  cudaCheckErrors("host allocation error");

  for (size_t i = 0; i < ds; i++) {
    h_x[i] = rand() / (ft)RAND_MAX;
  }

  // each GPU owns a 1/num_gpu share of the problem
  for (int d = 0; d < num_gpu; d++){
    cudaSetDevice(d);
    cudaMalloc(&d_x[d], (ds/num_gpu)*sizeof(ft));
    cudaMalloc(&d_y[d], (ds/num_gpu)*sizeof(ft));
  }
  cudaCheckErrors("device allocation error");

  cudaStream_t streams[num_gpu][num_streams/num_gpu];
  for (int d = 0; d < num_gpu; d++){
    cudaSetDevice(d);
    for (int i = 0; i < num_streams/num_gpu; i++)
      cudaStreamCreate(&streams[d][i]);
  }
  cudaCheckErrors("stream creation error");

#ifdef USE_STREAMS
  for (int d = 0; d < num_gpu; d++){
    // FIX: select the device that owns d_y[d] before operating on it.
    // Without this, cudaMemset/cudaDeviceSynchronize run against whatever
    // device is current and fail with "invalid argument".
    cudaSetDevice(d);
    cudaMemset(d_y[d], 0, ds/num_gpu * sizeof(ft));
    cudaDeviceSynchronize();
  }
  cudaCheckErrors("memset error");
  unsigned long long et = dtime_usec(0);

#pragma omp parallel for
  for (int d = 0; d < num_gpu; d++) {
    cudaSetDevice(d);
    for (int i = 0; i < chunks; i++) { //depth-first launch
      int j = i % (num_streams/num_gpu);
      // host offset includes this device's 1/num_gpu share; device offset does not
      cudaMemcpyAsync(d_x[d] + i * (ds / (chunks*num_gpu)), h_x + i * (ds / (chunks*num_gpu)) + d * (ds/num_gpu), (ds / (chunks*num_gpu)) * sizeof(ft), cudaMemcpyHostToDevice, streams[d][j]);
      gaussian_pdf<<<((ds / (chunks*num_gpu)) + 255) / 256, 256, 0, streams[d][j]>>>(d_x[d] + i * (ds / (chunks*num_gpu)), d_y[d] + i * (ds / (chunks*num_gpu)), 0.0, 1.0, ds / (chunks*num_gpu));
      cudaMemcpyAsync(h_y + i * (ds / (chunks*num_gpu)) + d *(ds/num_gpu), d_y[d] + i * (ds / (chunks*num_gpu)), (ds / (chunks*num_gpu)) * sizeof(ft), cudaMemcpyDeviceToHost, streams[d][j]);
    }
  }
  for (int d = 0; d < num_gpu; d++){
    cudaSetDevice(d);
    cudaDeviceSynchronize();  // drain all streams on this device
  }
  cudaCheckErrors("streams execution error");

  et = dtime_usec(et);

  std::cout << "streams elapsed time: " << et/(float)USECPSEC << std::endl;
#endif

  return 0;
}

find out which API call exactly is giving the invalid argument error. Either sprinkle a cudaCheckErrors() call after every API call, or use compute-sanitizer, or both.

My guess would be you are missing a cudaSetDevice() in this loop:

  for (int d = 0; d < num_gpu; d++){
    cudaMemset(d_y[d], 0, ds/num_gpu * sizeof(ft));
    cudaDeviceSynchronize();
  }
1 Like

yeah that was it. Thanks for pointing that out. Got a solution time of 0.013559 (I have 2 GPUs on 1 node), compared to the single GPU case which gives me streams elapsed time: 0.026343

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.