Here is an example of how to tackle it, without worrying too much about CPU thread usage:
$ cat streams_solution.cu
#include <math.h>
#include <iostream>
#include <time.h>
#include <sys/time.h>
#include <stdio.h>
// modifiable
typedef float ft;
const int chunks = 64;
const int MAX_DEV = 8;
const size_t ds = 2ULL*1024*1024*chunks*MAX_DEV;
const int count = 22;
const int num_streams = 8;
// not modifiable
const float sqrt_2PIf = 2.5066282747946493232942230134974f;
const double sqrt_2PI = 2.5066282747946493232942230134974;
__device__ float gpdf(float val, float sigma) {
  return expf(-0.5f * val * val) / (sigma * sqrt_2PIf);
}
__device__ double gpdf(double val, double sigma) {
  return exp(-0.5 * val * val) / (sigma * sqrt_2PI);
}
// compute average gaussian pdf value over a window around each point
__global__ void gaussian_pdf(const ft * __restrict__ x, ft * __restrict__ y, const ft mean, const ft sigma, const int n) {
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx < n) {
    ft in = x[idx] - (count / 2) * 0.01f;
    ft out = 0;
    for (int i = 0; i < count; i++) {
      ft temp = (in - mean) / sigma;
      out += gpdf(temp, sigma);
      in += 0.01f;
    }
    y[idx] = out / count;
  }
}
// error check macro
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(__err), \
              __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n"); \
      exit(1); \
    } \
  } while (0)
// host-based timing
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start) {
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec * USECPSEC) + tv.tv_usec) - start;
}
int main() {
  ft *h_x, **d_x, *h_y, *h_y1, **d_y;
  cudaHostAlloc(&h_x,  ds * sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y,  ds * sizeof(ft), cudaHostAllocDefault);
  cudaHostAlloc(&h_y1, ds * sizeof(ft), cudaHostAllocDefault);
  cudaCheckErrors("allocation error");
  int num_dev;
  cudaGetDeviceCount(&num_dev);
  cudaCheckErrors("device count error");
  if (((num_dev != 1) && (num_dev != 2) && (num_dev != 4) && (num_dev != 8)) || (num_dev > MAX_DEV)) {
    std::cout << "invalid device count: " << num_dev << std::endl;
    return 0;
  }
  cudaStream_t streams[MAX_DEV][num_streams];
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    for (int i = 0; i < num_streams; i++)
      cudaStreamCreate(&streams[d][i]);
  }
  cudaCheckErrors("stream creation error");
  d_x = new ft *[num_dev];
  d_y = new ft *[num_dev];
  for (size_t i = 0; i < ds; i++) {
    h_x[i] = rand() / (ft)RAND_MAX;
  }
  // device 0 gets a full-size buffer so it can also run the non-streamed test;
  // the other devices only need their 1/num_dev share of the data
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    cudaMalloc(d_x + d, ((d == 0) ? ds : (ds / num_dev)) * sizeof(ft));
    cudaMalloc(d_y + d, ((d == 0) ? ds : (ds / num_dev)) * sizeof(ft));
  }
  cudaCheckErrors("device allocation error");
  // baseline: one bulk copy in, one kernel launch, one bulk copy out, all on device 0
  cudaSetDevice(0);
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds); // warm-up
  cudaDeviceSynchronize();
  unsigned long long et1 = dtime_usec(0);
  cudaMemcpy(d_x[0], h_x, ds * sizeof(ft), cudaMemcpyHostToDevice);
  gaussian_pdf<<<(ds + 255) / 256, 256>>>(d_x[0], d_y[0], 0.0, 1.0, ds);
  cudaMemcpy(h_y1, d_y[0], ds * sizeof(ft), cudaMemcpyDeviceToHost);
  cudaCheckErrors("non-streams execution error");
  et1 = dtime_usec(et1);
  std::cout << "non-stream single-gpu elapsed time: " << et1 / (float)USECPSEC << std::endl;
#ifdef USE_STREAMS
  cudaMemset(d_y[0], 0, ds * sizeof(ft));
  cudaDeviceSynchronize();
  unsigned long long et = dtime_usec(0);
#pragma omp parallel for
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    // each device handles a contiguous 1/num_dev slice of the host data,
    // broken into `chunks` pieces of cs elements, issued depth-first
    // (copy in, kernel, copy out per chunk), round-robin across the streams
    const size_t cs = ds / (chunks * num_dev); // chunk size in elements
    for (int i = 0; i < chunks; i++) {
      cudaMemcpyAsync(d_x[d] + i * cs, h_x + i * cs + d * (ds / num_dev), cs * sizeof(ft),
                      cudaMemcpyHostToDevice, streams[d][i % num_streams]);
      gaussian_pdf<<<(cs + 255) / 256, 256, 0, streams[d][i % num_streams]>>>(
          d_x[d] + i * cs, d_y[d] + i * cs, 0.0, 1.0, cs);
      cudaMemcpyAsync(h_y + i * cs + d * (ds / num_dev), d_y[d] + i * cs, cs * sizeof(ft),
                      cudaMemcpyDeviceToHost, streams[d][i % num_streams]);
    }
  }
  for (int d = 0; d < num_dev; d++) {
    cudaSetDevice(d);
    cudaDeviceSynchronize();
  }
  cudaCheckErrors("streams execution error");
  et = dtime_usec(et);
  for (size_t i = 0; i < ds; i++) {
    if (h_y[i] != h_y1[i]) {
      std::cout << "mismatch at " << i << " was: " << h_y[i] << " should be: " << h_y1[i] << std::endl;
      return -1;
    }
  }
  std::cout << "streams elapsed time for " << num_dev << " gpus: " << et / (float)USECPSEC << std::endl;
#endif
  return 0;
}
$ nvcc -o streams_solution streams_solution.cu -DUSE_STREAMS
$ CUDA_VISIBLE_DEVICES="0,1,2,3" ./streams_solution
non-stream single-gpu elapsed time: 0.189018
streams elapsed time for 4 gpus: 0.045659
$
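Note that as compiled above, the #pragma omp parallel for is inert, since no OpenMP flag is passed to the host compiler: a single CPU thread issues the (asynchronous) work for every device in turn, which is in keeping with not worrying about CPU threads. If you wanted one CPU thread per device, a compile line along these lines (an untested sketch) should enable it:
$ nvcc -Xcompiler -fopenmp -o streams_solution streams_solution.cu -DUSE_STREAMS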
That is about a 4.1x speedup (0.189018 / 0.045659) over the non-streamed single-GPU baseline, which doesn’t quite match what I would expect from four GPUs plus copy/compute overlap, so I may still have some things to work out. The next step would be to get out the profiler. Exercise left to the reader.
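For instance, something like this (assuming Nsight Systems is installed; --stats=true prints summary tables to the console) would show how well the copies and kernels actually overlap:
$ nsys profile --stats=true ./streams_solution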
Later: system topology matters in the multi-GPU case. On the system I am working on, pairs of GPUs share a single PCIe link to the CPU. This will tend to reduce performance somewhat when transfers in the same direction are queued to two GPUs on the same link, compared to the “ideal” case of each GPU being fully independent.
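The connection matrix can be inspected with nvidia-smi (the output is system-specific, so no sample shown here):
$ nvidia-smi topo -m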