Hi all,
I’m trying to run a fairly complex kernel on multiple GPUs in parallel. Each run is independent, and ideally I need them to run concurrently. So far, I haven’t been able to do that. Perhaps I’m missing something in the setup and launch, so I’m attaching a very simple example that reproduces the problem. The kernel’s start and stop printf statements show that until device 0 is done, device 1 doesn’t start executing. According to Nvidia’s “Multi-GPU Programming” presentation, the syntax I’m using “should” work. The CUDA Programming Guide doesn’t have much on this subject, except a very basic syntax that doesn’t seem to work for me.
I use a server with 6 Tesla 2050 cards and CUDA 4.2.
The Nvidia supplied multi-GPU example (simpleMultiGPU) also runs sequentially, which makes me very suspicious. Running a kernel on multiple GPUs like this defeats the purpose of having several GPUs in the first place. Only one device is executing at any time and the rest are idle, which is clearly not ideal.
I’m hoping that the problem is on my end, and someone could suggest a way to get this to work.
Thanks in advance
Sasha
/* main.cpp */
#include "gpu.h"
// Entry point: all GPU setup and kernel launches live in compute() (gpu.cu).
int main(int argc, char* argv[]) {
    (void)argc;  // command-line arguments are not used
    (void)argv;
    compute();
    return 0;
}
/* gpu.h */
void compute();
/* gpu.cu */
#include <iostream>
#include "gpu.h"
#include <cuda_runtime.h>
#define THREADS 32
#define BLOCKS 1
#define DATA_SIZE 1600
// Per-block scratch buffer used purely as busy-work for the kernel.
// NOTE(review): declaring __shared__ at file scope is legal but unusual;
// each block still gets its own private copy of this array.
__shared__ double data[DATA_SIZE];//give the kernel something to do
// Busy-work kernel that fills the shared buffer with index*2 and prints
// start/finish markers so concurrency across devices can be observed.
//   i      - per-device launch sequence number (used only in the printf text)
//   device - device ordinal the host selected before launching (printf only)
// Device-side printf requires compute capability 2.0+ (true for the poster's
// Fermi-class Teslas).
// NOTE(review): the index math ignores blockIdx.x, so with BLOCKS > 1 every
// block would walk the same offsets of its own shared copy — harmless at the
// current BLOCKS == 1, but worth confirming if the launch config changes.
// NOTE(review): nothing ever reads `data` back, so the compiler is free to
// dead-code-eliminate the fill loop; verify with a profiler if this kernel's
// runtime is being used as a timing workload.
__global__ void kernel(int i, int device){
if (blockIdx.x == 0 && threadIdx.x == 0) printf("KERNEL %d STARTED ON DEVICE %d\n", i, device);
// Barrier is outside the divergent printf branch above, so all threads reach it.
__syncthreads();
int pass = 0;
// Strided fill: on pass p, thread t writes index p*THREADS*BLOCKS + t,
// terminating once the computed index runs past DATA_SIZE.
while (1){
int index = pass * THREADS * BLOCKS + threadIdx.x;
if (index >= DATA_SIZE) break;
data[index] = index * 2;
pass++;
}
if (blockIdx.x == 0 && threadIdx.x == 0) printf("KERNEL %d FINISHED ON DEVICE %d\n", i, device);
}
/*
 * Launch two independent busy-work kernels on EVERY detected device, each in
 * its own non-default stream, then wait for all of them and release the
 * streams.  Interface unchanged: void compute().
 *
 * Fixes over the original:
 *  - devices 0/1 and the "4" in the sync loop were hard-coded even though
 *    devices*2 streams were allocated: with < 2 devices the original invoked
 *    cudaSetDevice(1) and synchronized uncreated streams (UB), and with the
 *    poster's 6 GPUs it silently ignored devices 2-5.
 *  - every CUDA call is now error-checked; a failed call leaves a sticky
 *    error that makes later calls fail mysteriously if ignored.
 *  - streams are destroyed and the stream array freed (original leaked both).
 *  - removed the deprecated cudaThreadSynchronize() and the leftover
 *    "Error in kernel" check that ran before any kernel was launched.
 */
void compute(){
    const int STREAMS_PER_DEVICE = 2;  // matches the original: 2 streams/device
    int devices = 0;
    cudaError_t err = cudaGetDeviceCount(&devices);
    if (err != cudaSuccess || devices <= 0){
        std::cout << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << std::endl;
        return;
    }
    printf("Detected %d devices\n", devices);

    cudaStream_t* streams = new cudaStream_t[devices * STREAMS_PER_DEVICE];
    int stream_counter = 0;
    for (int i = 0; i < devices; i++){
        cudaSetDevice(i);
        for (int j = 0; j < STREAMS_PER_DEVICE; j++){
            cudaStream_t s;
            err = cudaStreamCreate(&s);
            if (err != cudaSuccess){
                std::cout << "cudaStreamCreate failed on device " << i << ": "
                          << cudaGetErrorString(err) << std::endl;
                // Destroy whatever was created before bailing out.
                for (int k = 0; k < stream_counter; k++) cudaStreamDestroy(streams[k]);
                delete[] streams;
                return;
            }
            streams[stream_counter] = s;
            std::cout << "Device " << i << ": Created stream " << s
                      << " at index " << stream_counter << std::endl;
            stream_counter++;
        }
    }

    // Issue every launch back-to-back BEFORE any synchronization so the
    // kernels have a chance to execute concurrently across GPUs.  A stream
    // is only valid on the device it was created on, so cudaSetDevice must
    // precede each device's launches.
    for (int i = 0; i < devices; i++){
        cudaSetDevice(i);
        for (int j = 0; j < STREAMS_PER_DEVICE; j++){
            kernel<<<BLOCKS, THREADS, 0, streams[i * STREAMS_PER_DEVICE + j]>>>(j, i);
            err = cudaGetLastError();  // launch-configuration errors surface here
            if (err != cudaSuccess){
                std::cout << "Error in kernel = " << cudaGetErrorString(err) << std::endl;
            }
        }
    }

    // Wait for every stream, then clean up (the original leaked all streams
    // and the array itself).  cudaStreamSynchronize also surfaces any
    // asynchronous execution errors from the kernels.
    for (int i = 0; i < devices * STREAMS_PER_DEVICE; i++){
        err = cudaStreamSynchronize(streams[i]);
        if (err != cudaSuccess){
            std::cout << "Stream " << i << " failed: " << cudaGetErrorString(err) << std::endl;
        }
        cudaStreamDestroy(streams[i]);
    }
    delete[] streams;
}