I have written a simple kernel function that performs vector addition: it adds two arrays, d_x and d_y, element by element.
However, the results are all zeros when I print the final values in h_z, which is copied back from d_z on the GPU.
I think I must have made some naive mistake in my code, but I have not been able to spot it.
I hope someone can help point it out.
Thanks in advance.
#include <stdlib.h>
#include <stdio.h>
#include <cutil_inline.h>
//
// kernel routine
//
// Element-wise addition: each thread computes its global index from its
// block and thread coordinates (x dimension only) and stores
// z[idx] = x[idx] + y[idx].
//
//   z - output array
//   x - first input array
//   y - second input array
//
// There is no bounds check, so the launch must not supply more threads
// than the arrays have elements.
__global__ void my_first_kernel(float *z, float *x, float *y)
{
    // offset of this block's first thread within the whole grid
    const unsigned int block_offset = blockIdx.x * blockDim.x;
    const int idx = block_offset + threadIdx.x;

    const float lhs = x[idx];
    const float rhs = y[idx];
    z[idx] = lhs + rhs;
}
//
// main code
//
// Host driver: fills two host arrays with 0..nsize-1, copies them to the
// device, launches my_first_kernel to add them element by element, copies
// the result back and prints inputs and output.
//
// Returns 0 on the normal path and EXIT_FAILURE if a host allocation fails.
// Every CUDA runtime status, including the kernel launch status, is passed
// to cutilSafeCall.
int main(int argc, char **argv)
{
    float *h_x, *h_y, *h_z;
    float *d_x, *d_y, *d_z;
    int nblocks, nthreads, nsize, n;
    size_t nbytes;

    // initialise card
    cutilDeviceInit(argc, argv);

    // set number of blocks, and threads per block
    nblocks  = 2;
    nthreads = 8;
    // one element per launched thread: the kernel has no bounds check
    nsize    = nblocks*nthreads;
    nbytes   = nsize*sizeof(float);

    // allocate memory for array
    h_x = (float *)malloc(nbytes);
    h_y = (float *)malloc(nbytes);
    h_z = (float *)malloc(nbytes);
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)nbytes);
        // free(NULL) is a no-op, so releasing all three is safe
        free(h_x);
        free(h_y);
        free(h_z);
        return EXIT_FAILURE;
    }

    for (n=0; n<nsize; n++)
    {
        // h_z starts at -1 so output that the copy-back never overwrote
        // is distinguishable from a computed value
        h_z[n] = -1.0f;
        h_x[n] = (float)n;
        h_y[n] = (float)n;
    }

    cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
    cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

    cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,cudaMemcpyHostToDevice) );

    for (n=0; n<nsize; n++) printf(" n, x = %d %g \n",n,h_x[n]);
    for (n=0; n<nsize; n++) printf(" n, y = %d %g \n",n,h_y[n]);

    // execute kernel
    // The launch is issued on the default stream after the copies above,
    // so no explicit synchronisation is needed before it.
    my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

    // A kernel launch returns no status of its own: launch failures must be
    // fetched with cudaGetLastError(), and failures during execution surface
    // at the next synchronising call. Without these checks a failed launch
    // goes unnoticed and d_z is printed without ever having been written.
    cutilSafeCall(cudaGetLastError());
    // NOTE(review): cudaThreadSynchronize is deprecated in later toolkits in
    // favour of cudaDeviceSynchronize; kept because this file targets the
    // cutil-era SDK - confirm against the toolkit in use.
    cutilSafeCall(cudaThreadSynchronize());

    // copy back results and print them out
    cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,cudaMemcpyDeviceToHost) );
    for (n=0; n<nsize; n++) printf(" n, z = %d %g \n",n,h_z[n]);

    // free memory
    cutilSafeCall(cudaFree(d_x));
    cutilSafeCall(cudaFree(d_y));
    cutilSafeCall(cudaFree(d_z));
    free(h_x);
    free(h_y);
    free(h_z);

    return 0;
}