I try to make a simple example to see how multi GPU work, in that for each GPU i simply allocate an global array, and fill it with the value as the id of that GPU. But to my surprise, it does not work at all. Can any one tell me what happens.
Thank a lot.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cutil.h>
#include <multithreading.h>
// includes, project
// Maximum number of CPU threads or GPUs.
#define MAX_CPU_THREAD 4
// Kernel configuration.
// use small number of blocks for device emulation to ensure we don't run too long.
#ifdef __DEVICE_EMULATION__
#define NUM_THREADS 256
#define NUM_BLOCKS 1
#else
#define NUM_THREADS 256
#define NUM_BLOCKS 1024*32
#endif
static int s_gpuCount = 0;
__global__ static void simple_kernel(float * g_idata, float c)
{
extern __shared__ float sdata[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int dim = blockDim.x;
g_idata[tid + dim * bid] = c;
}
float *odata;
static CUT_THREADPROC gpuThread1(int * device)
{
CUDA_SAFE_CALL(cudaSetDevice(*device));
const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
CUDA_SAFE_CALL(cudaMalloc( (void**) &odata, mem_size));
// @@ Copy some values to the buffers.
// Invoke kernel on this device.
simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(odata, ((float) (*device)) + 1.0);
// @@ Get the results back.
float* temp = (float*) malloc(mem_size);
cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);
for (unsigned int i=0; i< 20; ++i)
fprintf(stderr, "%f", temp[i]);
fprintf(stderr, "\n");
free(temp);
CUT_THREADEND;
}
static CUT_THREADPROC gpuTest(int * device)
{
CUDA_SAFE_CALL(cudaSetDevice(*device));
const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
float* temp = (float*) malloc(mem_size);
cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);
for (unsigned int i=0; i< NUM_THREADS * NUM_BLOCKS; ++i)
if (temp[i] != (float)(*device)){
fprintf(stderr, "TEST FAILED %d %f \n ", i, temp[i]);
break;
}
CUT_THREADEND;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
CUT_DEVICE_INIT();
// Enumerate GPUs.
CUDA_SAFE_CALL(cudaGetDeviceCount(&s_gpuCount));
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
// Cap the number of threads.
if (s_gpuCount > MAX_CPU_THREAD)
{
s_gpuCount = MAX_CPU_THREAD;
}
if (s_gpuCount == 0)
{
printf("No GPU found\n");
}
else if (s_gpuCount == 1)
{
printf("Only one GPU found\n");
CUT_SAFE_CALL(cutStartTimer(timer));
// Run a single thread.
int thread = 0;
gpuThread1(&thread);
CUT_SAFE_CALL(cutStopTimer(timer));
}
else
{
int threadIds[MAX_CPU_THREAD];
printf("%d GPUs found\n", s_gpuCount);
CUT_SAFE_CALL(cutStartTimer(timer));
CUTThread * threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);
// Start one thread for each device.
for(int i = 0; i < s_gpuCount; i++)
{
threadIds[i] = i;
threads[i] = cutStartThread((CUT_THREADROUTINE)gpuThread1, (void *)&threadIds[i]);
}
// Wait for all the threads to finish.
cutWaitForThreads(threads, s_gpuCount);
free(threads);
threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);
for(int i = 0; i < s_gpuCount; i++)
{
threadIds[i] = i;
threads[i] = cutStartThread((CUT_THREADROUTINE)gpuTest, (void *)&threadIds[i]);
}
// Wait for all the threads to finish.
cutWaitForThreads(threads, s_gpuCount);
free(threads);
CUT_SAFE_CALL(cutStopTimer(timer));
}
printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
CUT_EXIT(argc, argv);
}