Zero output in basic Vector Addition application in CUDA

Hi Everyone,

I have spent a lot of time trying to fix a bug in the following vector addition application (the code sample below does the computation on both the GPU and the CPU). All of the output from the __global__ kernel function is 0! I am using CUDA Toolkit 3.2 and driver 260.99. The graphics card is an NVIDIA GTX 480. OS: Windows 7, 64-bit.

I would greatly appreciate any hint as to where the bug is.

Thank you!


#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <windows.h>

//#define VAR 100000000
#define VAR 8

/*
 * Device kernel: element-wise vector addition, C[i] = A[i] + B[i].
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread handles the single
 * element whose index is blockDim.x * blockIdx.x + threadIdx.x, so the grid
 * must supply at least N threads in total.
 *
 * A, B : device pointers to the N input elements (read only)
 * C    : device pointer to the N output elements
 * N    : number of elements in each vector
 */
__global__ void vecAdd_d(const float *A, const float *B, float *C, int N) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to a whole number of blocks, so the threads in
    // the tail of the last block must not touch memory past element N-1.
    if (tid < N) {
        C[tid] = A[tid] + B[tid];
    }
}


/*
 * Host (CPU) reference implementation of element-wise vector addition,
 * C[i] = A[i] + B[i] for i in [0, N).
 *
 * A, B : host pointers to the N input elements (read only)
 * C    : host pointer to the N output elements
 * N    : number of elements in each vector; nothing is written when N <= 0
 */
void vecAdd_h(const float *A, const float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        C[i] = A[i] + B[i];
        // printf("%f\n",C[i]);
    }
}

/*
 * Prints a diagnostic and terminates the process when a CUDA runtime call
 * did not return cudaSuccess. `what` names the step that was attempted.
 */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two VAR-element vectors once on the GPU (vecAdd_d) and once on the
 * CPU (vecAdd_h), prints both results, and reports the wall-clock time of
 * each computation together with the resulting speedup.
 *
 * Every CUDA runtime call and the kernel launch itself are checked, so a
 * kernel that was never built or launched is reported as an error instead
 * of silently leaving zeros in the output buffer.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * error.
 */
__declspec(dllexport) int main() {
    const size_t bytes = sizeof(float) * VAR;

    float *A_h = NULL;
    float *B_h = NULL;
    float *C_h = NULL;

    float *A_d = NULL;
    float *B_d = NULL;
    float *C_d = NULL;

    printf("**************GPU Processing****************\n\n");
    printf("Device Initialization ...\n\n");

    A_h = (float *) malloc(bytes);
    B_h = (float *) malloc(bytes);
    C_h = (float *) malloc(bytes);
    if (A_h == NULL || B_h == NULL || C_h == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(A_h);
        free(B_h);
        free(C_h);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&A_d, bytes), "cudaMalloc(A_d)");
    checkCuda(cudaMalloc((void**)&B_d, bytes), "cudaMalloc(B_d)");
    checkCuda(cudaMalloc((void**)&C_d, bytes), "cudaMalloc(C_d)");

    printf("Memory Allocation is Done!\n");

    for (int i = 0; i < VAR; i++) {
        A_h[i] = 2.0f;
        B_h[i] = 2.0f;
    }

    checkCuda(cudaMemcpy(A_d, A_h, bytes, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(B_d, B_h, bytes, cudaMemcpyHostToDevice), "copy B to device");

    printf("Data copied from Host to Device!\n");

    int threadsPerBlock = 4;
    // Ceiling division so a partial final block still covers the tail.
    int blocksPerGrid = (VAR + threadsPerBlock - 1) / threadsPerBlock;

    printf("Thread configuration is Done!\n\n");

    // Kernel invocation
    printf("Invoking Kernel functions...\n");
    LARGE_INTEGER curFreq_d, curStart_d, curEnd_d;
    QueryPerformanceFrequency(&curFreq_d);
    QueryPerformanceCounter(&curStart_d);

    vecAdd_d<<<blocksPerGrid, threadsPerBlock>>>(A_d, B_d, C_d, VAR);
    // A launch does not return a status; launch failures are only visible
    // through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vecAdd_d launch");
    // The launch is asynchronous: wait for the kernel before stopping the
    // timer. cudaThreadSynchronize() matches the CUDA 3.2 toolkit used here;
    // on CUDA 4.0 and later its replacement is cudaDeviceSynchronize().
    checkCuda(cudaThreadSynchronize(), "vecAdd_d execution");

    QueryPerformanceCounter(&curEnd_d);

    checkCuda(cudaMemcpy(C_h, C_d, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    printf("Data copied from Device to Host!\n");

    printf("Device adding result:\n");
    for (int i = 0; i < VAR; i++)
        printf("line: %f\n", C_h[i]);

    checkCuda(cudaFree(A_d), "cudaFree(A_d)");
    checkCuda(cudaFree(B_d), "cudaFree(B_d)");
    checkCuda(cudaFree(C_d), "cudaFree(C_d)");
    printf("Device memory space is Freed!\n\n");

    double time_d = (double)(curEnd_d.QuadPart - curStart_d.QuadPart) / curFreq_d.QuadPart;
    printf("Device Executing Time: %f(ms)\n", time_d * 1000);
    printf("**************GPU Processing is Done****************\n\n");

    printf("**************CPU Processing****************\n\n");
    LARGE_INTEGER curFreq_h, curStart_h, curEnd_h;
    QueryPerformanceFrequency(&curFreq_h);
    QueryPerformanceCounter(&curStart_h);

    // Overwrites C_h, which still holds the device result printed above.
    vecAdd_h(A_h, B_h, C_h, VAR);

    QueryPerformanceCounter(&curEnd_h);

    printf("Host adding result:\n");
    for (int i = 0; i < VAR; i++)
        printf("line: %f\n", C_h[i]);
    double time_h = (double)(curEnd_h.QuadPart - curStart_h.QuadPart) / curFreq_h.QuadPart;
    printf("Host Executing Time = %f(ms)\n", time_h * 1000);
    printf("**************CPU Processing is Done****************\n\n");

    printf("Vector Size = %d\n", VAR);
    printf("Speedup = %f\n", time_h / time_d);

    // The host buffers came from malloc, so they are released with free
    // rather than delete.
    free(A_h);
    free(B_h);
    free(C_h);

    return 0;
}


Since you have no error trapping I assume you’re running this in the debugger (?)

Did you try with dim3(dim, 1, 1)?


Thank you for your reply!

I am not quite sure what you mean by dim3(dim,1,1). Would you mind telling me more about this configuration format? For my application, I guess a one-dimensional array is enough…

If you also have a CUDA environment available, it would be great to check whether it works on another CUDA machine.




Thank you for your reply!

Yes, I am in Debug mode. I just changed it to Release mode, and the error message said that cutil32.lib cannot be found.

Another thing: in the Debug mode I was using, the following basic program worked well; it has no thread-assignment issue in its kernel function. So I guess the problem may be due to an incorrect thread configuration setting, but I really have no idea what's wrong with it.





  • This is an example of a CUDA program.


#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <cutil.h>


/* Example */


/*
 * Device kernel: copies the leading characters of "Hello CUDA!" into
 * `result`.
 *
 * The kernel does not use its thread or block index: every launched thread
 * writes the same bytes to the same locations. The example launches it with
 * a single thread (<<<1, 1, 0>>>).
 *
 * result : device buffer with room for at least `num` characters
 * num    : number of characters to copy; at most the 12 bytes of the
 *          message (11 characters plus the terminating NUL) are written
 */
__global__ static void HelloCUDA(char* result, int num)
{
    const char p_HelloCUDA[] = "Hello CUDA!";
    const int available = (int)sizeof(p_HelloCUDA);

    // Stop at the end of the message so a large `num` cannot read past it.
    for (int i = 0; i < num && i < available; i++) {
        result[i] = p_HelloCUDA[i];
    }
}




/* HelloCUDA */


/*
 * Launches HelloCUDA with one thread, reports how long the launch and the
 * following synchronisation took, copies the 11 generated characters back
 * to the host and prints them.
 *
 * Error handling and timing go through the cutil macros and functions
 * already used by this example (CUDA_SAFE_CALL, CUT_SAFE_CALL,
 * CUT_CHECK_ERROR, cutCreateTimer, ...).
 */
int main(int argc, char* argv[])
{
    char *device_result = 0;
    // 12 zeroed bytes: only 11 are overwritten by the copy below, so the
    // buffer stays NUL-terminated for printf("%s").
    char host_result[12] = {0};

    CUDA_SAFE_CALL( cudaMalloc((void**) &device_result, sizeof(char) * 11));

    unsigned int timer = 0;
    CUT_SAFE_CALL( cutCreateTimer( &timer));
    CUT_SAFE_CALL( cutStartTimer( timer));

    HelloCUDA<<<1, 1, 0>>>(device_result, 11);
    CUT_CHECK_ERROR("Kernel execution failed\n");

    // Wait for the kernel to finish before stopping the timer.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );
    CUT_SAFE_CALL( cutStopTimer( timer));

    printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
    CUT_SAFE_CALL( cutDeleteTimer( timer));

    CUDA_SAFE_CALL( cudaMemcpy(host_result, device_result, sizeof(char) * 11, cudaMemcpyDeviceToHost));
    printf("%s\n", host_result);

    CUDA_SAFE_CALL( cudaFree(device_result));

    CUT_EXIT(argc, argv);

    return 0;
}


I suggest checking the error codes returned by all functions, including memcpy. Maybe CUDA is not initialized. You could also try:
dim3 dimBlock(threadsPerBlock, 1,1);
dim3 dimGrid(blocksPerGrid, 1,1);

Also, you use __declspec(dllexport) on the main function. Are you building a DLL?


Yes, I am generating dll.

I used your configuration format. Unfortunately, nothing changed. I also added an error-checking function like this:

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * error   : status returned by the CUDA call being checked
 * message : caller-supplied text identifying that call
 *
 * Only prints; execution continues after a failure, so the caller decides
 * whether to stop.
 */
void cudasafe( cudaError_t error, const char* message) {
    if (error != cudaSuccess) {
        // cudaError_t is an enum; cast so the argument matches %i.
        fprintf(stderr,"ERROR: %s : %i\n",message,(int)error);
    }
}




Still nothing changed. I am using Debug mode.

Any guess?



Do none of the functions return an error?
I wonder, would your hello example work if you put it into a DLL?


It works.

The error was not in my code but in the VS2008 settings: I had not checked the CUDA Runtime API option in the Custom Build Rules list…

Thank you so much!