I start a small cuda program (only test the correctness)
and I can get correct result when using device emulation mode
but when I turn to GPU mode, the result is wrong.
Can anyone help me? Thank you in advance.
The program is only to sum ten numbers.
The following is my program. (On windows XP, Geforece 8800GT, cuda2.0)
#include <stdio.h>
#define THREAD_NUM 1
global void sump_on_device(int* number,int size, int *sum)
{
const int tid=threadIdx.x;
int myfrom,myend,avgsize;
avgsize=size/THREAD_NUM;
myfrom=tid*avgsize;
if(myfrom>=size) myend=myfrom;
myend=myfrom+avgsize;
if(tid==(THREAD_NUM-1)) myend=size;
int p=0;
for(int i=myfrom;i<myend;i++)
p+=number[i];
sum[tid]=p;
// printf(“%d %d\n”,tid,sum[tid]);
return;
}
extern “C” void SUMP_ON_DEVICE(int* number, int* size, int* sum)
{
int* numberD;
int* sumD;
int* result;
result=(int *)malloc(sizeof(int)*THREAD_NUM);
cudaMalloc( (void **)&numberD,(*size)*sizeof(int));
cudaMalloc( (void **)&sumD,sizeof(int)*THREAD_NUM);
cudaMemcpy(numberD,number,(*size)*sizeof(int),cudaMemcpyHost
ToDevice);
sump_on_device<<<1,THREAD_NUM,0>>>(numberD,*size,sumD);
cudaMemcpy(result,sumD,THREAD_NUM*sizeof(int),cudaMemcpyDevi
ceToHost);
*sum=0;
for(int i=0;i<THREAD_NUM;i++)
*sum+=result[i];
cudaFree(numberD);
cudaFree(sumD);
free(result);
}
int main(void)
{
int data_size=10;
int *number;
number=new int[data_size];
for(int i=0;i<data_size;i++)
{
number[i]=i;
printf("%d %d\n",i,number[i]);
}
int SumP_Host=0;
for(int i=0;i<data_size;i++)
SumP_Host+=number[i];
printf("Results on host: %d \n",SumP_Host);
int SumP_Device;
SUMP_ON_DEVICE(number, &data_size, &SumP_Device);
printf("Results on device: %d \n",SumP_Device);
delete []number;
return 0;
}
The running result on device emulation mode*
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Results on host: 45
Results on device: 45
The running result on real GPU mode
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Results on host: 45
Results on device: 3824584