Newbie question on the return values of a vector addition

farseeing · November 10, 2010, 10:44am

I have written a simple kernel function to do the vector addition (simply adding two arrays d_x and d_y element by element in the kernel).

But the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think there must be some naive mistake i have made in my code.

But I could not find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: each thread computes one output element,
// z[i] = x[i] + y[i], where i is the thread's global index in a 1-D grid.
//
// There is no bounds check, so every one of the gridDim.x*blockDim.x
// launched threads reads x and y and writes z at its own index; all three
// arrays must therefore hold at least that many elements.
__global__ void my_first_kernel(float *z, float *x, float *y)
{
	// flatten (block, thread) into a single global element index
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;

	// load both operands first, then store the sum
	const float lhs = x[idx];
	const float rhs = y[idx];

	z[idx] = lhs + rhs;
}

//

// main code

//

// Host driver for the vector-addition example.
//
// Fills two host arrays of nblocks*nthreads floats with x[n] = y[n] = n,
// copies them to the device, launches my_first_kernel with one thread per
// element, copies the result back and prints x, y and z.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails; any
// CUDA error is reported through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;
	nbytes   = nsize*sizeof(float);   // size of each array in bytes

	// allocate host memory for the arrays
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);

	// the arrays are written immediately below, so a failed malloc must
	// be caught here rather than crash in the initialisation loop
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n",
				(unsigned long)nbytes);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // sentinel: still -1 after the copy back means z was never written
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device memory and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,
				cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,
				cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel
	cutilSafeCall(cudaThreadSynchronize());
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own.  Query it explicitly,
	// otherwise a launch that never ran (for instance a binary built for
	// the wrong -arch) goes unnoticed and z just holds stale data.
	cutilSafeCall(cudaGetLastError());

	// errors raised while the kernel executes surface at this sync
	cutilSafeCall(cudaThreadSynchronize());

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,
				cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

farseeing · November 10, 2010, 10:44am

I have written a simple kernel function to do the vector addition (simply adding two arrays d_x and d_y element by element in the kernel).

But the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think there must be some naive mistake i have made in my code.

But I could not find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: each thread computes one output element,
// z[i] = x[i] + y[i], where i is the thread's global index in a 1-D grid.
//
// There is no bounds check, so every one of the gridDim.x*blockDim.x
// launched threads reads x and y and writes z at its own index; all three
// arrays must therefore hold at least that many elements.
__global__ void my_first_kernel(float *z, float *x, float *y)
{
	// flatten (block, thread) into a single global element index
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;

	// load both operands first, then store the sum
	const float lhs = x[idx];
	const float rhs = y[idx];

	z[idx] = lhs + rhs;
}

//

// main code

//

// Host driver for the vector-addition example.
//
// Fills two host arrays of nblocks*nthreads floats with x[n] = y[n] = n,
// copies them to the device, launches my_first_kernel with one thread per
// element, copies the result back and prints x, y and z.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails; any
// CUDA error is reported through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;
	nbytes   = nsize*sizeof(float);   // size of each array in bytes

	// allocate host memory for the arrays
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);

	// the arrays are written immediately below, so a failed malloc must
	// be caught here rather than crash in the initialisation loop
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n",
				(unsigned long)nbytes);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // sentinel: still -1 after the copy back means z was never written
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device memory and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,
				cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,
				cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel
	cutilSafeCall(cudaThreadSynchronize());
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own.  Query it explicitly,
	// otherwise a launch that never ran (for instance a binary built for
	// the wrong -arch) goes unnoticed and z just holds stale data.
	cutilSafeCall(cudaGetLastError());

	// errors raised while the kernel executes surface at this sync
	cutilSafeCall(cudaThreadSynchronize());

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,
				cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

farseeing · November 10, 2010, 11:17am

I found the answer myself.

It was caused by the flag I passed to nvcc in the Makefile.

I had used the wrong -arch value in that flag.

I have written a simple kernel function to do the vector addition (simply adding two arrays d_x and d_y element by element in the kernel).

But the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think there must be some naive mistake i have made in my code.

But I could not find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: each thread computes one output element,
// z[i] = x[i] + y[i], where i is the thread's global index in a 1-D grid.
//
// There is no bounds check, so every one of the gridDim.x*blockDim.x
// launched threads reads x and y and writes z at its own index; all three
// arrays must therefore hold at least that many elements.
__global__ void my_first_kernel(float *z, float *x, float *y)
{
	// flatten (block, thread) into a single global element index
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;

	// load both operands first, then store the sum
	const float lhs = x[idx];
	const float rhs = y[idx];

	z[idx] = lhs + rhs;
}

//

// main code

//

// Host driver for the vector-addition example.
//
// Fills two host arrays of nblocks*nthreads floats with x[n] = y[n] = n,
// copies them to the device, launches my_first_kernel with one thread per
// element, copies the result back and prints x, y and z.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails; any
// CUDA error is reported through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;
	nbytes   = nsize*sizeof(float);   // size of each array in bytes

	// allocate host memory for the arrays
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);

	// the arrays are written immediately below, so a failed malloc must
	// be caught here rather than crash in the initialisation loop
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n",
				(unsigned long)nbytes);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // sentinel: still -1 after the copy back means z was never written
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device memory and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,
				cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,
				cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel
	cutilSafeCall(cudaThreadSynchronize());
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own.  Query it explicitly,
	// otherwise a launch that never ran (for instance a binary built for
	// the wrong -arch) goes unnoticed and z just holds stale data.
	cutilSafeCall(cudaGetLastError());

	// errors raised while the kernel executes surface at this sync
	cutilSafeCall(cudaThreadSynchronize());

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,
				cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

farseeing · November 10, 2010, 11:17am

I found the answer myself.

It was caused by the flag I passed to nvcc in the Makefile.

I had used the wrong -arch value in that flag.

I have written a simple kernel function to do the vector addition (simply adding two arrays d_x and d_y element by element in the kernel).

But the results are all zeros when I print the final results in h_z, which are copied back from d_z on the GPU.

I think there must be some naive mistake i have made in my code.

But I could not find it. I hope someone can help point it out.

Thanks in advance.

#include <stdlib.h>

#include <stdio.h>

#include <cutil_inline.h>

//

// kernel routine

// 

// Element-wise vector addition: each thread computes one output element,
// z[i] = x[i] + y[i], where i is the thread's global index in a 1-D grid.
//
// There is no bounds check, so every one of the gridDim.x*blockDim.x
// launched threads reads x and y and writes z at its own index; all three
// arrays must therefore hold at least that many elements.
__global__ void my_first_kernel(float *z, float *x, float *y)
{
	// flatten (block, thread) into a single global element index
	const int idx = blockIdx.x*blockDim.x + threadIdx.x;

	// load both operands first, then store the sum
	const float lhs = x[idx];
	const float rhs = y[idx];

	z[idx] = lhs + rhs;
}

//

// main code

//

// Host driver for the vector-addition example.
//
// Fills two host arrays of nblocks*nthreads floats with x[n] = y[n] = n,
// copies them to the device, launches my_first_kernel with one thread per
// element, copies the result back and prints x, y and z.
//
// Returns 0 on success and EXIT_FAILURE if a host allocation fails; any
// CUDA error is reported through cutilSafeCall.
int main(int argc, char **argv)
{
	float *h_x, *h_y, *h_z;
	float *d_x, *d_y, *d_z;
	int    nblocks, nthreads, nsize, n;
	size_t nbytes;

	// initialise card
	cutilDeviceInit(argc, argv);

	// set number of blocks, and threads per block
	nblocks  = 2;
	nthreads = 8;
	nsize    = nblocks*nthreads;
	nbytes   = nsize*sizeof(float);   // size of each array in bytes

	// allocate host memory for the arrays
	h_x = (float *)malloc(nbytes);
	h_y = (float *)malloc(nbytes);
	h_z = (float *)malloc(nbytes);

	// the arrays are written immediately below, so a failed malloc must
	// be caught here rather than crash in the initialisation loop
	if (h_x == NULL || h_y == NULL || h_z == NULL)
	{
		fprintf(stderr, "host allocation of %lu bytes failed\n",
				(unsigned long)nbytes);
		free(h_x);   // free(NULL) is a no-op
		free(h_y);
		free(h_z);
		return EXIT_FAILURE;
	}

	for (n=0; n<nsize; n++)
	{
		h_z[n] = -1;   // sentinel: still -1 after the copy back means z was never written
		h_x[n] = n;
		h_y[n] = n;
	}

	// allocate device memory and upload the inputs
	cutilSafeCall(cudaMalloc((void **)&d_x, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_y, nbytes));
	cutilSafeCall(cudaMalloc((void **)&d_z, nbytes));

	cutilSafeCall( cudaMemcpy(d_x,h_x,nbytes,
				cudaMemcpyHostToDevice) );
	cutilSafeCall( cudaMemcpy(d_y,h_y,nbytes,
				cudaMemcpyHostToDevice) );

	for (n=0; n<nsize; n++) printf(" n,  x  =  %d  %g \n",n,h_x[n]);
	for (n=0; n<nsize; n++) printf(" n,  y  =  %d  %g \n",n,h_y[n]);

	// execute kernel
	cutilSafeCall(cudaThreadSynchronize());
	my_first_kernel<<<nblocks,nthreads>>>(d_z,d_x,d_y);

	// A kernel launch returns no status of its own.  Query it explicitly,
	// otherwise a launch that never ran (for instance a binary built for
	// the wrong -arch) goes unnoticed and z just holds stale data.
	cutilSafeCall(cudaGetLastError());

	// errors raised while the kernel executes surface at this sync
	cutilSafeCall(cudaThreadSynchronize());

	// copy back results and print them out
	cutilSafeCall( cudaMemcpy(h_z,d_z,nbytes,
				cudaMemcpyDeviceToHost) );

	for (n=0; n<nsize; n++) printf(" n,  z  =  %d  %g \n",n,h_z[n]);

	// free memory
	cutilSafeCall(cudaFree(d_x));
	cutilSafeCall(cudaFree(d_y));
	cutilSafeCall(cudaFree(d_z));

	free(h_x);
	free(h_y);
	free(h_z);

	return 0;
}

Topic		Replies	Views
Getting started with CUDA ... cannot add simple vectors CUDA Programming and Performance	9	20975	January 31, 2011
vecadd outputs all zeros Teaching and Curriculum Support	2	1400	September 4, 2013
Zero output in basic Vector Addition application in CUDA CUDA Programming and Performance	8	4901	January 18, 2011
Help! Sum of vectors CUDA Programming and Performance	7	918	June 16, 2011
why vector addition sdk example fails when using zero copy? CUDA Programming and Performance	1	821	January 20, 2012
Why it doesnt work ? Simple program that adds two vectors CUDA Programming and Performance	6	3904	March 18, 2010
The kernel always returns values equal to zero CUDA Programming and Performance	10	8088	February 2, 2018
My first program it doesn't behave as expected CUDA Programming and Performance	2	2506	July 19, 2009
VectorAdd can compile, but output is odd. CUDA Programming and Performance	2	650	January 5, 2012
Result of simple vector summation is not correct. CUDA Programming and Performance	2	784	July 23, 2013

Newbie question on the return values of a vector addition

Related topics