Newbie question on the return values of a vector addition

I have written a simple kernel function that performs vector addition (it simply adds two arrays, d_x and d_y, element by element in the kernel).

However, the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think I must have made some naive mistake in my code.

But I have not been able to find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: z[i] = x[i] + y[i].
//
//   z  output array, written by the kernel
//   x  first input array (read only)
//   y  second input array (read only)
//
// Each thread handles one element, selected by its flat 1D global index.
// There is no bounds check, so the launch must supply exactly as many
// threads (gridDim.x * blockDim.x) as there are valid elements in z, x and y.
__global__ void my_first_kernel(float *z, const float *x, const float *y)
{
	// flat global index across a 1D grid of 1D blocks
	int tid = threadIdx.x + blockDim.x*blockIdx.x;

	z[tid] = x[tid] + y[tid];
}

//

// main code

//

// Host driver: fills two small arrays, adds them on the GPU with
// my_first_kernel, copies the result back and prints it.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails.
// CUDA runtime calls are checked through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;   // one array element per launched thread
	nbytes   = nsize*sizeof(float);

	// allocate host arrays; stop early if any allocation failed
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %d floats failed\n", nsize);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // presumably a sentinel so an unwritten result is easy to spot
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device arrays and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel (no explicit sync needed first: the launch is ordered
	// after the preceding copies on the default stream)
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own. Query it explicitly so
	// that a failed launch (e.g. a binary built for an -arch the device
	// cannot run) is reported instead of silently leaving d_z unwritten.
	cutilSafeCall( cudaGetLastError() );

	// wait for the kernel and surface any error raised while it executed
	cutilSafeCall( cudaThreadSynchronize() );

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

I have written a simple kernel function that performs vector addition (it simply adds two arrays, d_x and d_y, element by element in the kernel).

However, the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think I must have made some naive mistake in my code.

But I have not been able to find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: z[i] = x[i] + y[i].
//
//   z  output array, written by the kernel
//   x  first input array (read only)
//   y  second input array (read only)
//
// Each thread handles one element, selected by its flat 1D global index.
// There is no bounds check, so the launch must supply exactly as many
// threads (gridDim.x * blockDim.x) as there are valid elements in z, x and y.
__global__ void my_first_kernel(float *z, const float *x, const float *y)
{
	// flat global index across a 1D grid of 1D blocks
	int tid = threadIdx.x + blockDim.x*blockIdx.x;

	z[tid] = x[tid] + y[tid];
}

//

// main code

//

// Host driver: fills two small arrays, adds them on the GPU with
// my_first_kernel, copies the result back and prints it.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails.
// CUDA runtime calls are checked through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;   // one array element per launched thread
	nbytes   = nsize*sizeof(float);

	// allocate host arrays; stop early if any allocation failed
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %d floats failed\n", nsize);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // presumably a sentinel so an unwritten result is easy to spot
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device arrays and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel (no explicit sync needed first: the launch is ordered
	// after the preceding copies on the default stream)
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own. Query it explicitly so
	// that a failed launch (e.g. a binary built for an -arch the device
	// cannot run) is reported instead of silently leaving d_z unwritten.
	cutilSafeCall( cudaGetLastError() );

	// wait for the kernel and surface any error raised while it executed
	cutilSafeCall( cudaThreadSynchronize() );

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

I found the answer myself.

The problem was caused by a flag I passed to nvcc in the Makefile.

I had used the wrong -arch value in that flag.

I found the answer myself.

The problem was caused by a flag I passed to nvcc in the Makefile.

I had used the wrong -arch value in that flag.