Hi,
I hope somebody reads this and has an idea of what I am doing wrong, or why this piece of code is not working as expected under Linux.
I have a Fedora 9 (64-bit) machine with two GTX 280s, both of which are detected correctly. But when I run a multi-GPU program I get the same performance as with a single GPU.
Here’s the code:
[codebox]#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
#include <unistd.h>
#include <cutil.h>
#include <multithreading.h>
#define MAXGPU 8
typedef struct {
int device;
int dataN;
int dataP;
} TGPUplan;
long datasize=10000000;
long loopsize=100000;
unsigned int * datah;
__global__ void chkkernel(unsigned int *data, const int N)
{
unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < N) data[index] = data[index] + 1;
}
static CUT_THREADPROC mythread(TGPUplan * plan)
{
int ID=plan->device;
CUDA_SAFE_CALL(cudaSetDevice(ID)); /* bind this host thread to its GPU */
unsigned int *data;
CUDA_SAFE_CALL(cudaMalloc((void**) &data, plan->dataN*sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMemcpy(data,&datah[plan->dataP],plan->dataN*sizeof(unsigned int),cudaMemcpyHostToDevice));
int block_size = 256;
int n_blocks = plan->dataN/block_size + ( plan->dataN%block_size == 0 ? 0:1 );
for(long i=0;i<loopsize;i++) chkkernel<<<n_blocks,block_size>>>(data, plan->dataN); /* launches are asynchronous; the memcpy below waits for them */
CUDA_SAFE_CALL(cudaMemcpy(&datah[plan->dataP],data,plan->dataN*sizeof(unsigned int),cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(data));
CUT_THREADEND;
}
__host__ int main(int argc, char **argv)
{
clock_t start,end;
TGPUplan plan[MAXGPU];
int GPU_N;
CUDA_SAFE_CALL(cudaGetDeviceCount(&GPU_N));
if (argc>1) GPU_N=atoi(argv[1]);
datah= (unsigned int*) malloc(datasize*sizeof(unsigned int));
for (long i=0;i<datasize;i++) datah[i]=0;
long bef=0;
for (int i=0;i<GPU_N;i++)
{
plan[i].dataN=datasize/GPU_N;
printf("# GPU %d gets array of size %d\n",i,plan[i].dataN);
plan[i].dataP=bef;
plan[i].device=i;
bef+=plan[i].dataN;
}
CUTThread * threadID = (CUTThread *)malloc(sizeof(CUTThread) * MAXGPU);
start=clock();
for (int i=0;i<GPU_N;i++)
{
threadID[i]=cutStartThread((CUT_THREADROUTINE)mythread, (void*)(plan+i));
}
cutWaitForThreads(threadID,GPU_N);
free(threadID);
end = clock();
double sumideal=(double)datasize*(double)loopsize;
double sum=0.0;
for (long i=0;i<datasize;i++) sum+=(double) datah[i];
printf("# Difference = %f\n", float(sum-sumideal));
printf("# Time taken (%d GPUs) : %f sec.\n", GPU_N, (end-start)/(double)CLOCKS_PER_SEC);
}
[/codebox]
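For what it's worth, this is the kind of standalone check I could run on each device id to rule out a silent failure (wrong device binding or a failed kernel launch) in one of the worker threads. It is only a sketch; the kernel name noop and the output format are made up for this example:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Trivial kernel just to force a real launch on the selected device. */
__global__ void noop(int *p) { if (threadIdx.x == 0) *p = 1; }

int main(int argc, char **argv)
{
    int wanted = (argc > 1) ? atoi(argv[1]) : 0;
    cudaSetDevice(wanted);

    int got = -1;
    cudaGetDevice(&got);                  /* which device is this thread actually using? */

    int *d = 0;
    cudaMalloc((void **)&d, sizeof(int)); /* creates the context on that device */
    noop<<<1, 32>>>(d);
    cudaThreadSynchronize();              /* wait so a launch failure becomes visible */

    printf("# requested device %d, got %d, status: %s\n",
           wanted, got, cudaGetErrorString(cudaGetLastError()));

    cudaFree(d);
    return 0;
}
[/codebox]
Running it once with 0 and once with 1 would at least tell me whether both cards accept work independently.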
It is compiled with NVCC 2.2 using the following options:
[codebox]nvcc --use_fast_math -m 64 -O3 -arch sm_13 -O3 -D$(DEF) -I/usr/local/cuda/include -I/home/mydir/NVIDIA_CUDA_SDK/common/inc -L/usr/local/cuda/lib -L/home/mydir/NVIDIA_CUDA_SDK/lib -lcutil test.cu -o test
[/codebox]
Running “./test” gives
[b]
GPU 0 gets array of size 5000000
GPU 1 gets array of size 5000000
Difference = 0.000000
Time taken (2 GPUs) : 98.970000 sec.[/b]
If I run “./test 1” (which forces the use of only one GPU) I obtain:
[b]
GPU 0 gets array of size 10000000
Difference = 0.000000
Time taken (1 GPUs) : 98.520000 sec.[/b]
At first I thought it might be a programming issue, but compiling and running the same code under Windows Vista (32-bit) gives the expected speed-up, i.e. the test case takes only half the time when using two GPUs.
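The only other difference I can think of is the timing itself: I am not sure whether clock() measures wall-clock time or accumulated CPU time of all threads on Linux. Just in case, here is a minimal sketch of how I would time the same region with gettimeofday() instead; the elapsed_sec() helper is just a name I made up:
[codebox]#include <stdio.h>
#include <sys/time.h>

/* Wall-clock timing sketch; elapsed_sec() is my own helper, not from the SDK. */
static double elapsed_sec(struct timeval t0, struct timeval t1)
{
    return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
}

int main(void)
{
    struct timeval t0, t1;
    gettimeofday(&t0, NULL);

    /* ... start the GPU threads and wait for them here ... */

    gettimeofday(&t1, NULL);
    printf("# Wall-clock time: %f sec.\n", elapsed_sec(t0, t1));
    return 0;
}
[/codebox]
If those numbers behave differently from the clock() numbers above, at least I would know whether the measurement or the GPUs are the problem.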
I’d really appreciate it if anybody has a comment on this issue.
Thanks in advance.
Thomas