Hi,
Following is the sample CUDA program. It has two arrays(a & b) of each 10 elements. The array elements will be summed and stored in third array(c).
#include<stdio.h>
#include<cuda.h>
#define N 10
global void add(int *ca, int *cb, int *cc)
{
int i;
for(i=0; i<N; i++)
cc[i] = ca[i] + cb[i];
}
int main()
{
int a[N], b[N], c[N], i, *dev_a, *dev_b, *dev_c;
cudaMalloc(&dev_a, Nsizeof(int));
cudaMalloc(&dev_b, Nsizeof(int));
cudaMalloc(&dev_c, N*sizeof(int));
for (i=0; i<N; i++)
{
a[i]=b[i]=i;
}
printf(“\n N = %d \n”, N);
printf("a[5] = %d\n b[5] = %d ", a[5], b[5] );
cudaMemcpy(dev_a,a,Nsizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,Nsizeof(int),cudaMemcpyHostToDevice);
add <<< N , 1 >>> ( dev_a, dev_b, dev_c );
cudaMemcpy(c,dev_c,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
printf(“\n THE CONTENTS ARE AS FOLLOWS: \n”);
for(i=0; i<N; i++)
printf("\t c[%d] = %d ",i, c[i]);
printf(“\n\n”);
return 0;
}
Compilation is smooth:
nvcc testarray.cu -o testarray
Output is incorrect:
time ./testarray
N = 10
a[5] = 5
b[5] = 5
THE CONTENTS ARE AS FOLLOWS:
c[0] = 612178064 c[1] = 32767 c[2] = 6308204 c[3] = 0 c[4] = 612178424 c[5] = 32767 c[6] = 612178408 c[7] = 32767 c[8] = 1 c[9] = 0
real 0m30.721s
user 0m0.051s
sys 0m30.318s
What’s going wrong here? And why its taking long time of 30 seconds for executing this small program.
Thank you