Simple multi-GPU example - why does it fail? Trying to understand how multi-GPU works

I am trying to make a simple example to see how multi-GPU works: for each GPU I simply allocate a global array and fill it with that GPU's id. But to my surprise, it does not work at all. Can anyone tell me what is happening?

Thanks a lot.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

// includes, project
#include <cutil.h>
#include <multithreading.h>

// Maximum number of CPU threads or GPUs.
#define MAX_CPU_THREAD    4

// Kernel configuration.
// Use a small number of blocks for device emulation to ensure we don't run too long.
#ifdef __DEVICE_EMULATION__
#define NUM_THREADS    256
#define NUM_BLOCKS     1
#else
#define NUM_THREADS    256
#define NUM_BLOCKS     1024*32
#endif

static int s_gpuCount = 0;

__global__ static void simple_kernel(float * g_idata, float c)
{
    extern __shared__ float sdata[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int dim = blockDim.x;

    g_idata[tid + dim * bid] = c;
}

float *odata;

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    CUDA_SAFE_CALL(cudaMalloc((void**) &odata, mem_size));

    // @@ Copy some values to the buffers.

    // Invoke kernel on this device.
    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(odata, ((float) (*device)) + 1.0);

    // @@ Get the results back.
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < 20; ++i)
        fprintf(stderr, "%f", temp[i]);
    fprintf(stderr, "\n");

    free(temp);

    CUT_THREADEND;
}

static CUT_THREADPROC gpuTest(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata, mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < NUM_THREADS * NUM_BLOCKS; ++i)
        if (temp[i] != (float)(*device)){
            fprintf(stderr, "TEST FAILED %d %f \n ", i, temp[i]);
            break;
        }

    CUT_THREADEND;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT();

    // Enumerate GPUs.
    CUDA_SAFE_CALL(cudaGetDeviceCount(&s_gpuCount));

    unsigned int timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));

    // Cap the number of threads.
    if (s_gpuCount > MAX_CPU_THREAD)
    {
        s_gpuCount = MAX_CPU_THREAD;
    }

    if (s_gpuCount == 0)
    {
        printf("No GPU found\n");
    }
    else if (s_gpuCount == 1)
    {
        printf("Only one GPU found\n");

        CUT_SAFE_CALL(cutStartTimer(timer));

        // Run a single thread.
        int thread = 0;
        gpuThread1(&thread);

        CUT_SAFE_CALL(cutStopTimer(timer));
    }
    else
    {
        int threadIds[MAX_CPU_THREAD];

        printf("%d GPUs found\n", s_gpuCount);

        CUT_SAFE_CALL(cutStartTimer(timer));

        CUTThread * threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);

        // Start one thread for each device.
        for (int i = 0; i < s_gpuCount; i++)
        {
            threadIds[i] = i;
            threads[i] = cutStartThread((CUT_THREADROUTINE)gpuThread1, (void *)&threadIds[i]);
        }

        // Wait for all the threads to finish.
        cutWaitForThreads(threads, s_gpuCount);
        free(threads);

        threads = (CUTThread *)malloc(sizeof(CUTThread) * s_gpuCount);
        for (int i = 0; i < s_gpuCount; i++)
        {
            threadIds[i] = i;
            threads[i] = cutStartThread((CUT_THREADROUTINE)gpuTest, (void *)&threadIds[i]);
        }

        // Wait for all the threads to finish.
        cutWaitForThreads(threads, s_gpuCount);
        free(threads);

        CUT_SAFE_CALL(cutStopTimer(timer));
    }

    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    CUT_EXIT(argc, argv);
}

odata will NOT be different for every CUDA context; that only works for things you have to define at file scope (constant variables). So for every GPU you need a different odata pointer.
I’m sorry if this was unclear in my other post.
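In other words, something like this (a minimal sketch; sizing the array by MAX_CPU_THREAD is just one option):

// One device pointer per GPU instead of a single shared global;
// each worker thread only ever touches odata[*device].
float *odata[MAX_CPU_THREAD];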

What do you mean by a different odata pointer? I tried to modify my program like this:

float *odata[2];

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    CUDA_SAFE_CALL(cudaMalloc((void**) &odata[*device], mem_size));

    // @@ Copy some values to the buffers.

    // Invoke kernel on this device.
    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(odata[*device], ((float) (*device)) + 1.0);

    CUT_THREADEND;
}

static CUT_THREADPROC gpuTest(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    float* temp = (float*) malloc(mem_size);
    cudaMemcpy(temp, odata[*device], mem_size, cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < NUM_THREADS * NUM_BLOCKS; ++i)
        if (temp[i] != (float)(*device)){
            fprintf(stderr, "TEST FAILED %d %f \n ", i, temp[i]);
            break;
        }

    CUT_THREADEND;
}

But it still fails. It seems to me that at CUT_THREADEND all the memory related to that device is released. Is that right? When I run without threads, the global memory stays there until the program exits; how can I do a similar thing with multiple GPUs?

Right, your CUDA context and all associated resources are destroyed when the thread exits.

Just have the worker thread copy the kernel’s output data device->host before exiting.
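A minimal sketch of what that could look like, based on the code above (the host-side buffers h_result and their allocation in main are illustrative additions, not from the original post):

// Host-side result buffers, one per GPU; main() allocates each one with
// malloc(mem_size) before starting the worker threads.
float *h_result[MAX_CPU_THREAD];

static CUT_THREADPROC gpuThread1(int * device)
{
    CUDA_SAFE_CALL(cudaSetDevice(*device));

    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);

    // Keep the device pointer local to this thread/context.
    float *d_data;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_data, mem_size));

    simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(d_data, ((float) (*device)) + 1.0f);

    // Copy the results back while this thread's CUDA context still exists;
    // after CUT_THREADEND the context and d_data are gone.
    CUDA_SAFE_CALL(cudaMemcpy(h_result[*device], d_data, mem_size, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_data));
    CUT_THREADEND;
}

The verification can then be done directly on h_result[device] on the host, without starting a second round of device threads.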

I have a program with multiple iterations. Should I reallocate the memory for each GPU every time I run an iteration? The amount of memory needed to save all the state of the program is huge, and I want to minimize the amount of transfer I have to perform.

You could make your threads persistent. Start a thread for each GPU when your program starts and use them for every iteration until you’re done.

This adds a certain amount of complexity to your host code but should be a lot more efficient than allocating and transferring data for each iteration.
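A rough sketch of one way to do that, using POSIX threads and a condition variable instead of the cutil thread wrappers (the GPUWorker struct and the signalling scheme are illustrative, not part of the SDK):

#include <pthread.h>

struct GPUWorker
{
    int             device;
    float          *d_data;      // device buffer, lives as long as the thread
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    bool            workReady;
    bool            shutdown;
};

static void *persistentThread(void *arg)
{
    GPUWorker *w = (GPUWorker *)arg;

    cudaSetDevice(w->device);                        // the context is bound to this thread
    const int mem_size = NUM_BLOCKS * NUM_THREADS * sizeof(float);
    cudaMalloc((void**) &w->d_data, mem_size);       // allocate once, reuse in every iteration

    pthread_mutex_lock(&w->mutex);
    for (;;)
    {
        while (!w->workReady && !w->shutdown)
            pthread_cond_wait(&w->cond, &w->mutex);  // sleep until main signals new work

        if (w->shutdown)
            break;

        simple_kernel<<<NUM_BLOCKS, NUM_THREADS>>>(w->d_data, (float) w->device + 1.0f);
        cudaThreadSynchronize();                     // wait for this iteration's kernel

        w->workReady = false;
        pthread_cond_signal(&w->cond);               // tell main this iteration is done
    }
    pthread_mutex_unlock(&w->mutex);

    cudaFree(w->d_data);                             // freed only when the program shuts down
    return 0;
}

main() would lock the mutex, set workReady, signal the condition variable, and then wait on it until workReady goes back to false; because the thread (and therefore its CUDA context) never exits between iterations, w->d_data keeps the state resident on the GPU the whole time.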

What do you mean by a persistent thread? Right now the memory is cleaned up each time I finish a thread. How can I preserve the results without copying them back and forth between GPU and CPU?

By persistent I mean don’t finish the thread. Leave it alive and always use the same thread as long as you use the GPU.

Is there any simple way to synchronize between CUDA devices so that I can implement such persistent threads?