Simple multi-GPU example - why does it fail? Trying to understand how multi-GPU works

I am trying to make a simple example to see how multi-GPU works: for each GPU I simply allocate a global array and fill it with that GPU's id. But to my surprise, it does not work at all. Can anyone tell me what is happening?

Thanks a lot.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

// includes, project
#include <cutil.h>
#include <multithreading.h>

// Maximum number of CPU threads or GPUs.
#define MAX_CPU_THREAD    4

// Kernel configuration.
// Use a small number of blocks for device emulation to ensure we don't run too long.
#ifdef __DEVICE_EMULATION__
#define NUM_THREADS    256
#define NUM_BLOCKS     1
#else
#define NUM_THREADS    256
#define NUM_BLOCKS     1024*32
#endif

static int s_gpuCount = 0;

__global__ static void simple_kernel(float * g_idata, float c)
{
    extern __shared__ float sdata[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int dim = blockDim.x;

    g_idata[tid + dim * bid] = c;
}

float *odata;

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    CUDA_SAFE_CALL(cudaMalloc((void**) &odata, mem_size));

    // @@ Copy some values to the buffers.

    // Invoke kernel on this device.
    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(odata, ((float) (*device)) + 1.0);

    // @@ Get the results back.
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < 20; ++i)
        fprintf(stderr, "%f", temp[i]);
    fprintf(stderr, "\n");

    free(temp);

    CUT_THREADEND;
}

static CUT_THREADPROC gpuTest(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < NUM_THREADS * NUM_BLOCKS; ++i)
        if (temp[i] != (float)(*device)){
            fprintf(stderr, "TEST FAILED %d %f \n ", i, temp[i]);
            break;
        }

    CUT_THREADEND;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT();

    // Enumerate GPUs.
    CUDA_SAFE_CALL(cudaGetDeviceCount(&s_gpuCount));

    unsigned int timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));

    // Cap the number of threads.
    if (s_gpuCount > MAX_CPU_THREAD)
    {
        s_gpuCount = MAX_CPU_THREAD;
    }

    if (s_gpuCount == 0)
    {
        printf("No GPU found\n");
    }
    else if (s_gpuCount == 1)
    {
        printf("Only one GPU found\n");

        CUT_SAFE_CALL(cutStartTimer(timer));

        // Run a single thread.
        int thread = 0;
        gpuThread1(&thread);

        CUT_SAFE_CALL(cutStopTimer(timer));
    }
    else
    {
        int threadIds[MAX_CPU_THREAD];

        printf("%d GPUs found\n", s_gpuCount);

        CUT_SAFE_CALL(cutStartTimer(timer));

        CUTThread * threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);

        // Start one thread for each device.
        for (int i = 0; i < s_gpuCount; i++)
        {
            threadIds[i] = i;
            threads[i] = cutStartThread((CUT_THREADROUTINE)gpuThread1, (void *)&threadIds[i]);
        }

        // Wait for all the threads to finish.
        cutWaitForThreads(threads, s_gpuCount);
        free(threads);

        threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);
        for (int i = 0; i < s_gpuCount; i++)
        {
            threadIds[i] = i;
            threads[i] = cutStartThread((CUT_THREADROUTINE)gpuTest, (void *)&threadIds[i]);
        }

        // Wait for all the threads to finish.
        cutWaitForThreads(threads, s_gpuCount);
        free(threads);

        CUT_SAFE_CALL(cutStopTimer(timer));
    }

    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    CUT_EXIT(argc, argv);
}

odata will NOT be different for every CUDA context; that only works for things you have to define at file scope (constant variables). So for every GPU you need a different odata pointer.
I’m sorry if this was unclear in my other post.
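In other words, something like this (a minimal sketch; sizing the array by MAX_CPU_THREAD is just one option):

// One device pointer per GPU instead of a single shared global;
// each worker thread only ever touches odata[*device].
float *odata[MAX_CPU_THREAD];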

What do you mean by a different odata pointer? I tried to modify my program like this:

float *odata[2];

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    CUDA_SAFE_CALL(cudaMalloc((void**) &odata[*device], mem_size));

    // @@ Copy some values to the buffers.

    // Invoke kernel on this device.
    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(odata[*device], ((float) (*device)) + 1.0);

    CUT_THREADEND;
}

static CUT_THREADPROC gpuTest(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata[*device], mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < NUM_THREADS * NUM_BLOCKS; ++i)
        if (temp[i] != (float)(*device)){
            fprintf(stderr, "TEST FAILED %d %f \n ", i, temp[i]);
            break;
        }

    CUT_THREADEND;
}

But it still fails. It seems to me that at CUT_THREADEND all the memory related to that device is released. Is that right? When I run without threads, the global memory stays there until the program exits; how can I do a similar thing with multiple GPUs?

Right, your CUDA context and all associated resources are destroyed when the thread exits.

Just have the worker thread copy the kernel’s output data device->host before exiting.
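A minimal sketch of what that could look like, based on the code above (the host-side buffers h_result and their allocation in main are illustrative additions, not from the original post):

// Host-side result buffers, one per GPU; main() allocates each one with
// malloc(mem_size) before starting the worker threads.
float *h_result[MAX_CPU_THREAD];

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);

    // Keep the device pointer local to this thread/context.
    float *d_data;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, mem_size));

    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_data, ((float) (*device)) + 1.0f);

    // Copy the results back while this thread's CUDA context still exists;
    // after CUT_THREADEND the context and d_data are gone.
    CUDA_SAFE_CALL(cudaMemcpy(h_result[*device], d_data, mem_size, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_data));
    CUT_THREADEND;
}

The verification can then be done directly on h_result[device] on the host, without starting a second round of device threads.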

I have a program with multiple iterations. Should I reallocate the memory for each GPU every time I run an iteration? The amount of memory needed to save all the state of the program is huge, and I want to minimize the amount of transfer I have to perform.

You could make your threads persistent. Start a thread for each GPU when your program starts and use them for every iteration until you’re done.

This adds a certain amount of complexity to your host code but should be a lot more efficient than allocating and transferring data for each iteration.
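A rough sketch of one way to do that, using POSIX threads and a condition variable instead of the cutil thread wrappers (the GPUWorker struct and the signalling scheme are illustrative, not part of the SDK):

#include <pthread.h>

struct GPUWorker
{
    int             device;
    float          *d_data;      // device buffer, lives as long as the thread
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    bool            workReady;
    bool            shutdown;
};

static void *persistentThread(void *arg)
{
    GPUWorker *w = (GPUWorker *)arg;

    cudaSetDevice(w->device);                        // the context is bound to this thread
    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    cudaMalloc((void**) &w->d_data, mem_size);       // allocate once, reuse in every iteration

    pthread_mutex_lock(&w->mutex);
    for (;;)
    {
        while (!w->workReady && !w->shutdown)
            pthread_cond_wait(&w->cond, &w->mutex);  // sleep until main signals new work

        if (w->shutdown)
            break;

        simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(w->d_data, (float) w->device + 1.0f);
        cudaThreadSynchronize();                     // wait for this iteration's kernel

        w->workReady = false;
        pthread_cond_signal(&w->cond);               // tell main this iteration is done
    }
    pthread_mutex_unlock(&w->mutex);

    cudaFree(w->d_data);                             // freed only when the program shuts down
    return 0;
}

main() would lock the mutex, set workReady, signal the condition variable, and then wait on it until workReady goes back to false; because the thread (and therefore its CUDA context) never exits between iterations, w->d_data keeps the state resident on the GPU the whole time.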

What do you mean by a persistent thread? Right now the memory is cleaned up each time I finish a thread. How can I preserve the results without copying them back and forth between GPU and CPU?

By persistent I mean don’t finish the thread. Leave it alive and always use the same thread as long as you use the GPU.

Is there any simple way to synchronize between CUDA devices so that I can implement such persistent threads?