Multi-GPU not working as expected - please comment

Hi,

I hope somebody reads this and has an idea of what I am doing wrong, or why this piece of code is not working as expected under Linux.

I have a Fedora 9 (64-bit) machine with 2 GTX 280s, both of which are detected correctly. But when running a multi-GPU program I get the same performance as a single GPU gives me.

Here’s the code:

[codebox]#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <unistd.h>
#include <time.h>          /* clock(), CLOCKS_PER_SEC */
#include <cutil.h>
#include <multithreading.h>

#define MAXGPU 8

typedef struct {
    int device;   /* device index for this worker */
    int dataN;    /* number of elements assigned to this GPU */
    int dataP;    /* offset of this GPU's slice in the host array */
} TGPUplan;

long datasize = 10000000;
long loopsize = 100000;
unsigned int *datah;

__global__ void chkkernel(unsigned int *data, const int N)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N) data[index] = data[index] + 1.0;
}

static CUT_THREADPROC mythread(TGPUplan *plan)
{
    int ID = plan->device;
    CUDA_SAFE_CALL(cudaSetDevice(ID));

    unsigned int *data;
    CUDA_SAFE_CALL(cudaMalloc((void **)&data, plan->dataN * sizeof(unsigned int)));
    CUDA_SAFE_CALL(cudaMemcpy(data, &datah[plan->dataP], plan->dataN * sizeof(unsigned int), cudaMemcpyHostToDevice));

    int block_size = 256;
    int n_blocks = plan->dataN / block_size + (plan->dataN % block_size == 0 ? 0 : 1);

    for (long i = 0; i < loopsize; i++) chkkernel<<<n_blocks, block_size>>>(data, plan->dataN);

    CUDA_SAFE_CALL(cudaMemcpy(&datah[plan->dataP], data, plan->dataN * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(data));
    CUT_THREADEND;
}

int main(int argc, char *argv[])
{
    clock_t start, end;
    TGPUplan plan[MAXGPU];
    int GPU_N;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&GPU_N));
    if (argc > 1) GPU_N = atoi(argv[1]);

    datah = (unsigned int *)malloc(datasize * sizeof(unsigned int));
    for (long i = 0; i < datasize; i++) datah[i] = 0;

    /* split the array evenly across the GPUs */
    long bef = 0;
    for (int i = 0; i < GPU_N; i++)
    {
        plan[i].dataN = datasize / GPU_N;
        printf("# GPU %d gets array of size %d\n", i, plan[i].dataN);
        plan[i].dataP = bef;
        plan[i].device = i;
        bef += plan[i].dataN;
    }

    CUTThread *threadID = (CUTThread *)malloc(sizeof(CUTThread) * MAXGPU);

    start = clock();
    for (int i = 0; i < GPU_N; i++)
    {
        threadID[i] = cutStartThread((CUT_THREADROUTINE)mythread, (void *)(plan + i));
    }
    cutWaitForThreads(threadID, GPU_N);
    free(threadID);
    end = clock();

    /* each element should have been incremented loopsize times */
    double sumideal = (double)datasize * (double)loopsize;
    double sum = 0.0;
    for (long i = 0; i < datasize; i++) sum += (double)datah[i];

    printf("# Difference = %f\n", (float)(sum - sumideal));
    printf("# Time taken (%d GPUs) : %f sec.\n", GPU_N, (end - start) / (double)CLOCKS_PER_SEC);
}
[/codebox]

It is compiled with NVCC 2.2 using the following options:

[codebox]nvcc --use_fast_math -m 64 -O3 -arch sm_13 -O3 -D$(DEF) -I/usr/local/cuda/include -I/home/mydir/NVIDIA_CUDA_SDK/common/inc -L/usr/local/cuda/lib -L/home/mydir/NVIDIA_CUDA_SDK/lib -lcutil test.cu -o test

[/codebox]

Running “./test” gives

[b]# GPU 0 gets array of size 5000000
# GPU 1 gets array of size 5000000
# Difference = 0.000000
# Time taken (2 GPUs) : 98.970000 sec.[/b]

If I run “./test 1” (which forces the use of one GPU) I obtain:

[b]# GPU 0 gets array of size 10000000
# Difference = 0.000000
# Time taken (1 GPUs) : 98.520000 sec.[/b]

First I thought it might be a programming issue, but compiling the code under Windows Vista (32-bit) gives me the expected speed-up, i.e. the test case takes only half the time when using two GPUs.

I’d really appreciate it if anybody has a comment on this issue.

Thanks in advance.

Thomas

Uh, use a threading library with known behavior (read: not cutil) and time your performance with timers of known behavior (read: not cutil) and then see what happens.
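For illustration, here is a minimal wall-clock timing sketch of the kind of timer meant above, based on gettimeofday() from <sys/time.h>; the helper name wallclock_seconds is only an illustrative choice and is not part of the code discussed in this thread.

[codebox]/* Minimal wall-clock timer sketch (illustration only): measures elapsed
   real time instead of cutil timers or per-process CPU time. */
#include <stdio.h>
#include <sys/time.h>

static double wallclock_seconds(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);                    /* microsecond-resolution wall-clock time */
    return (double)tv.tv_sec + 1e-6 * (double)tv.tv_usec;
}

/* usage:
     double t0 = wallclock_seconds();
     ... start worker threads, wait for them ...
     double t1 = wallclock_seconds();
     printf("elapsed: %f sec\n", t1 - t0);
*/
[/codebox]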

Thanks for the post

OK, I will give it a try and change to pthreads. But why does it work on Windows and not on Linux? Is this a “feature” of cutil?

The timing is done with clock(), which I believe does not belong to cutil at all?

I don’t trust cutil for anything. Its behavior changes from version to version and is poorly defined. It should not be used by anyone who is not specifically writing an SDK sample.

Thanks for that direct answer.

BTW, I have now changed the source to use pthreads. Here’s the code:

[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>        /* strerror() */
#include <limits.h>
#include <unistd.h>
#include <time.h>          /* clock(), CLOCKS_PER_SEC */
#include <cutil.h>
#include <multithreading.h>
#include <pthread.h>

#define MAXGPU 8

#define errexit(code,str)                                  \
    fprintf(stderr, "%s: %s\n", (str), strerror(code));    \
    exit(1);

typedef struct {
    int device;   /* device index for this worker */
    int dataN;    /* number of elements assigned to this GPU */
    int dataP;    /* offset of this GPU's slice in the host array */
} TGPUplan;

long datasize = 10000000;
long loopsize = 100000;
unsigned int *datah;

__global__ void chkkernel(unsigned int *data, const int N)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N) data[index] = data[index] + 1.0;
}

void *hola(void *arg)
{
    TGPUplan *plan = (TGPUplan *)arg;
    int ID = plan->device;
    CUDA_SAFE_CALL(cudaSetDevice(ID));

    unsigned int *data;
    CUDA_SAFE_CALL(cudaMalloc((void **)&data, plan->dataN * sizeof(unsigned int)));
    CUDA_SAFE_CALL(cudaMemcpy(data, &datah[plan->dataP], plan->dataN * sizeof(unsigned int), cudaMemcpyHostToDevice));

    int block_size = 256;
    int n_blocks = plan->dataN / block_size + (plan->dataN % block_size == 0 ? 0 : 1);

    for (long i = 0; i < loopsize; i++) chkkernel<<<n_blocks, block_size>>>(data, plan->dataN);

    CUDA_SAFE_CALL(cudaMemcpy(&datah[plan->dataP], data, plan->dataN * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaFree(data));
    return arg;   /* the plan's first member is the device index, checked in main() */
}

int main(int argc, char *argv[])
{
    clock_t start, end;
    TGPUplan plan[MAXGPU];
    int GPU_N;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&GPU_N));
    if (argc > 1) GPU_N = atoi(argv[1]);

    datah = (unsigned int *)malloc(datasize * sizeof(unsigned int));
    for (long i = 0; i < datasize; i++) datah[i] = 0;

    long bef = 0;
    for (int i = 0; i < GPU_N; i++)
    {
        plan[i].dataN = datasize / GPU_N;
        printf("# GPU %d gets array of size %d\n", i, plan[i].dataN);
        plan[i].dataP = bef;
        plan[i].device = i;
        bef += plan[i].dataN;
    }

    start = clock();

    int worker;
    pthread_t threads[MAXGPU];   /* holds thread info */
    int errcode;                 /* holds pthread error code */
    int *status;                 /* holds return code */

    /* create one worker thread per GPU */
    for (worker = 0; worker < GPU_N; worker++)
    {
        if ((errcode = pthread_create(&threads[worker], NULL, hola, &plan[worker])))
        {
            errexit(errcode, "pthread_create");
        }
    }

    /* wait for the workers and check their exit status */
    for (worker = 0; worker < GPU_N; worker++)
    {
        if ((errcode = pthread_join(threads[worker], (void **)&status)))
        {
            errexit(errcode, "pthread_join");
        }
        if (*status != worker)
        {
            fprintf(stderr, "thread %d terminated abnormally\n", worker);
            exit(1);
        }
    }

    end = clock();

    double sumideal = (double)datasize * (double)loopsize;
    double sum = 0.0;
    for (long i = 0; i < datasize; i++) sum += (double)datah[i];

    printf("# Difference = %f\n", (float)(sum - sumideal));
    printf("# Time taken (%d GPUs) : %f sec.\n", GPU_N, (end - start) / (double)CLOCKS_PER_SEC);
}
[/codebox]

Nevertheless, here’s what I get

[b]# GPU 0 gets array of size 5000000
# GPU 1 gets array of size 5000000
# Difference = 0.000000
# Time taken (2 GPUs) : 99.040000 sec.[/b]

And forcing to use one GPU

[b]# GPU 0 gets array of size 10000000
# Difference = 0.000000
# Time taken (1 GPUs) : 98.490000 sec.[/b]

Now everything should be fairly independent of cutil (except for the CUDA_SAFE_CALL macros; should I remove those too? A cutil-free replacement is sketched below).

Do you have any ideas what else to try?
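As a possible cutil-free replacement for CUDA_SAFE_CALL, something like the sketch below could be used; the macro name CHECK_CUDA is my own choice, but cudaGetErrorString() is part of the CUDA runtime, so the check itself does not depend on cutil.

[codebox]/* Sketch of a hand-rolled error-check macro that could replace CUDA_SAFE_CALL. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(1);                                                  \
        }                                                             \
    } while (0)

/* usage: CHECK_CUDA(cudaMalloc((void **)&data, plan->dataN * sizeof(unsigned int))); */
[/codebox]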

I have tried several threading libraries and different ways to measure the elapsed time, but there is no gain in using the 2nd card under Linux. What am I doing wrong? Has anybody succeeded in really using 2 cards under Linux AND getting the performance they expected?

I have used two cards (3 even!) from different processes in Linux and gotten the full performance improvement I expected. I’ve never tried two threads in the same process, but I’m almost certain others have gotten that to work as well.

Thanks for the reply. At least this gives me hope that it's possible.
BTW, can you please post the architecture you were running your software on?

Not sure what you mean here. I just start two processes, and each one calls cudaSetDevice() with a different number.
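A rough sketch of that multi-process approach, as I understand it (the file name and the data-slicing details are only illustrative):

[codebox]/* Sketch of the multi-process variant (illustration only): each process is
   started with a device index on the command line and binds itself to that GPU. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
    int dev = (argc > 1) ? atoi(argv[1]) : 0;   /* device index from the shell */
    cudaSetDevice(dev);                         /* bind this process to one GPU */
    printf("process bound to GPU %d\n", dev);

    /* ... allocate, copy and launch kernels for this process's share of the data ... */
    return 0;
}

/* e.g. launch "./worker 0 &" and "./worker 1 &" and let each handle half the array */
[/codebox]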

I mean, which OS are you using? Ubuntu, Fedora, …? 32-bit or 64-bit? Even the kernel version would help…

Unfortunately, I am relying on the code running with threads. But since it works as expected under Windows, and even the SDK samples work with threads, I thought it should be possible.

Anyway, thanks for your reply.

I solved the problem (if it ever was a real one). The crucial point is to use neither the timing and threading functions from cutil nor the clock() function from time.h (at least under Linux): on Linux, clock() returns the CPU time accumulated by all threads of the process rather than the elapsed wall-clock time, so with two busy worker threads it reports roughly the same number as with one, even though the wall-clock time is halved.
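To illustrate that last point, here is a small standalone sketch (my own, not part of the CUDA code): two threads each busy-wait for about one second of wall-clock time, and clock() reports roughly two seconds.

[codebox]/* Demonstration sketch: clock() on Linux sums the CPU time of all threads,
   so two busy threads running for ~1 s of wall time report ~2 s. */
#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

static void *spin(void *arg)
{
    struct timeval t0, t1;
    gettimeofday(&t0, NULL);
    do {                                        /* busy-wait ~1 s of wall time */
        gettimeofday(&t1, NULL);
    } while ((t1.tv_sec - t0.tv_sec) + 1e-6 * (t1.tv_usec - t0.tv_usec) < 1.0);
    return arg;
}

int main(void)
{
    pthread_t a, b;
    clock_t start = clock();
    pthread_create(&a, NULL, spin, NULL);
    pthread_create(&b, NULL, spin, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    clock_t end = clock();

    /* prints roughly 2.0 although only about 1 s of real time has passed */
    printf("clock() says %f sec\n", (end - start) / (double)CLOCKS_PER_SEC);
    return 0;
}
[/codebox]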

Here’s the modified code for those who are interested

[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>        /* strerror() */
#include <limits.h>
#include <unistd.h>
#include <time.h>          /* time(), localtime(), asctime() */
//#include <cutil.h>
#include <multithreading.h>
#include <pthread.h>

#define MAXGPU 8

#define errexit(code,str)                                  \
    fprintf(stderr, "%s: %s\n", (str), strerror(code));    \
    exit(1);

typedef struct {
    int device;   /* device index for this worker */
    int dataN;    /* number of elements assigned to this GPU */
    int dataP;    /* offset of this GPU's slice in the host array */
} TGPUplan;

long datasize = 10000000;
long loopsize = 100000;
unsigned int *datah;

__global__ void chkkernel(unsigned int *data, const int N)
{
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N) data[index] = data[index] + 1.0;
}

void *hola(void *arg)
{
    time_t rawtime;
    struct tm *timeinfo;

    time(&rawtime);
    timeinfo = localtime(&rawtime);

    TGPUplan *plan = (TGPUplan *)arg;
    int ID = plan->device;
    printf("++ Job %d starts at %s", ID, asctime(timeinfo));

    cudaSetDevice(ID);

    unsigned int *data;
    cudaMalloc((void **)&data, plan->dataN * sizeof(unsigned int));
    cudaMemcpy(data, &datah[plan->dataP], plan->dataN * sizeof(unsigned int), cudaMemcpyHostToDevice);

    int block_size = 256;
    int n_blocks = plan->dataN / block_size + (plan->dataN % block_size == 0 ? 0 : 1);

    for (long i = 0; i < loopsize; i++) chkkernel<<<n_blocks, block_size>>>(data, plan->dataN);

    cudaMemcpy(&datah[plan->dataP], data, plan->dataN * sizeof(unsigned int), cudaMemcpyDeviceToHost);

    time(&rawtime);
    timeinfo = localtime(&rawtime);
    printf("-- Job %d finishes at %s", ID, asctime(timeinfo));

    cudaFree(data);
    return arg;   /* the plan's first member is the device index, checked in main() */
}

int main(int argc, char *argv[])
{
    TGPUplan plan[MAXGPU];
    int GPU_N;

    cudaGetDeviceCount(&GPU_N);
    if (argc > 1) GPU_N = atoi(argv[1]);

    datah = (unsigned int *)malloc(datasize * sizeof(unsigned int));
    for (long i = 0; i < datasize; i++) datah[i] = 0;

    long bef = 0;
    for (int i = 0; i < GPU_N; i++)
    {
        plan[i].dataN = datasize / GPU_N;
        printf("## GPU %d gets array of size %d\n", i, plan[i].dataN);
        plan[i].dataP = bef;
        plan[i].device = i;
        bef += plan[i].dataN;
    }

    time_t rawtime;
    struct tm *timeinfo;
    time(&rawtime);
    timeinfo = localtime(&rawtime);
    printf("** Program starts at %s", asctime(timeinfo));

    int worker;
    pthread_t threads[MAXGPU];   /* holds thread info */
    int errcode;                 /* holds pthread error code */
    int *status;                 /* holds return code */

    /* create the threads */
    for (worker = 0; worker < GPU_N; worker++)
    {
        if ((errcode = pthread_create(&threads[worker], NULL, hola, &plan[worker])))
        {
            errexit(errcode, "pthread_create");
        }
    }

    /* reap the threads as they exit */
    for (worker = 0; worker < GPU_N; worker++)
    {
        /* wait for thread to terminate */
        if ((errcode = pthread_join(threads[worker], (void **)&status)))
        {
            errexit(errcode, "pthread_join");
        }
        /* check thread's exit status */
        if (*status != worker)
        {
            fprintf(stderr, "thread %d terminated abnormally\n", worker);
            exit(1);
        }
    }

    time(&rawtime);
    timeinfo = localtime(&rawtime);
    printf("** Program ends at  %s", asctime(timeinfo));

    double sumideal = (double)datasize * (double)loopsize;
    double sum = 0.0;
    for (long i = 0; i < datasize; i++) sum += (double)datah[i];

    printf("# Difference = %f\n", (float)(sum - sumideal));
}
[/codebox]

After these modifications, I got the following with 2 GPUs:

## GPU 0 gets array of size 5000000
## GPU 1 gets array of size 5000000
** Program starts at Wed Jun 17 10:23:06 2009
++ Job 0 starts at Wed Jun 17 10:23:06 2009
++ Job 1 starts at Wed Jun 17 10:23:06 2009
-- Job 1 finishes at Wed Jun 17 10:23:55 2009
-- Job 0 finishes at Wed Jun 17 10:23:56 2009
** Program ends at Wed Jun 17 10:23:56 2009
# Difference = 0.000000

This is about twice as fast (50 sec. vs. 98 sec.) as with one GPU:

## GPU 0 gets array of size 10000000
** Program starts at Wed Jun 17 10:24:06 2009
++ Job 0 starts at Wed Jun 17 10:24:06 2009
-- Job 0 finishes at Wed Jun 17 10:25:44 2009
** Program ends at Wed Jun 17 10:25:44 2009
# Difference = 0.000000

Thanks to tmurray for the valuable comments that helped me understand the underlying problem.