Wrong outputs for SUMMATION example SUMMATION problem in CUDA

Hello Fellow Cuda Guys and Gals,

I would really appreciate if one of you spare some time and look at my code, as I believe that there is a fundamental flaw that I cannot find. The attached program does a summation of number 0 through 15. I can easily run the code in emulation mode, but when I run it in resgular device mode, then the output is either 0.000 or nan. I totally don’t get why such a simple program is driving me crazy. I am also a little new to the CUDA and have read a few topics on it and have search through the forums for help, but I still cannot find the bug. Again, I would be humbled if any one of you intelligent CUDA guys/gals help me. Please ask me questions … I am able to assist and will be monitoring this post occasionally.


terminal output:

:~/CUDA_sample_cs414$ nvcc -o test.out vector_add.cu -lcutil -I /usr/local/cuda/NVIDIA_GPU_Computing_SDK/C/common/inc/ -L /usr/local/cuda/NVIDIA_GPU_Computing_SDK/C/lib
./vector_add.cu(52): Warning: Cannot tell what pointer points to, assuming global memory space
:~/CUDA_sample_cs414$ ./test.out
Bag generation: 0.00000 sec
0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000Here is the output of the progam 0.000000
The sum is: 0.000000


#include <stdio.h>
#include <sys/time.h>

// The number of threads per blocks in the kernel
// if we define it here, then we can use its value in the kernel,
// for example to statically declare an array in shared memory)

// Forward function declarations
float *GPU_add_vectors(float *A, float *B, int N);
float *GPU_sum_vectors(struct Bag * bag, unsigned int N);
float *CPU_add_vectors(float *A, float *B, int N);
float *get_random_vector(int N);
long long start_timer();
long long stop_timer(long long start_time, char *name);
void die(char *message);
unsigned int N = 16;

struct Bag {
unsigned int itemCount;
float *data;
} ;

struct Bag * initBag() {
struct Bag * bag;
bag = (Bag*)malloc(sizeof(struct Bag));
bag->itemCount = N;
bag->data=(float*) malloc(N*sizeof(float));
return bag;

void fill_bag (struct Bag * temp, unsigned int size ){

for(int i = 0;  i < size ; i++) {
temp->data[i] = i;

temp->itemCount = size;

global void GPU_bagfrequency(struct Bag *c_bag, float * c_result){

__shared__ float sum[16];

float temp[16];

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sum[tid] = c_bag->data[i];


	for(unsigned int s = 16/2; s > 0 ; s >>= 1){
		if( tid < s) {
		sum[tid] += sum[tid + s];
	float t = sum[tid]+sum[tid^s];
	sum[tid] = t;

if(tid ==0){
 temp[blockIdx.x] = sum[tid];
*c_result = temp[blockIdx.x];


int main(int argc, char **argv) {
// Seed the random generator (use a constant here for repeatable results)

// Determine the vector length
N = 16;  // default value
if (argc > 1) N = atoi(argv[1]); // user-specified value

// Generate two random vectors
long long vector_start_time = start_timer();
struct Bag *A = initBag();
struct Bag *B = initBag();

//Fill the two bags with data
fill_bag(A, N);
fill_bag(B, N);
stop_timer(vector_start_time, "Bag generation");

// Compute their sum on the GPU
long long GPU_start_time = start_timer();
//float *C_GPU = GPU_add_vectors(A, B, N);
float *C_GPU = GPU_sum_vectors(B,N);
printf("Here is the output of the progam %f \n", *C_GPU);
//long long GPU_time = stop_timer(GPU_start_time, "\t Total \n");
printf("The sum is: %f \n ",C_GPU);

// Compute their sum on the CPU
long long CPU_start_time = start_timer();
float *C_CPU = CPU_add_vectors(A, B, N);
long long CPU_time = stop_timer(CPU_start_time, “\nCPU”);

// Compute the speedup or slowdown
if (GPU_time > CPU_time) printf("\nCPU outperformed GPU by %.2fx\n", (float) GPU_time / (float) CPU_time);
else                     printf("\nGPU outperformed CPU by %.2fx\n", (float) CPU_time / (float) GPU_time);

// Check the correctness of the GPU results
int num_wrong = 0;
for (int i = 0; i < N; i++) {
	if (fabs(C_CPU[i] - C_GPU[i]) > 0.000001) num_wrong++;

// Report the correctness results
if (num_wrong) printf("\n%d / %d values incorrect\n", num_wrong, N);
else           printf("\nAll values correct\n");


float *GPU_sum_vectors(struct Bag * bag, unsigned int N) {

for(int i = 0;  i < N ; i++) {
printf(" %f", bag->data[i]);

float * result_in = (float*) malloc(sizeof(float)*N);

// Allocate GPU memory for the inputs and the result
struct Bag * c_bag;
float * c_result;
cudaMalloc((void**) &c_bag, sizeof(Bag)*N); 
cudaMalloc((void**) &c_result, sizeof(float)*N);

// Make sure the allocations were successful
if(!(c_bag && c_result)) printf("Error allocating GPU memory");

// Transfer the input vectors to GPU memory
cudaMemcpy((void*)c_result,(void*) result_in, sizeof(float)*N, cudaMemcpyHostToDevice);
cudaMemcpy((void*)c_bag, (void*)bag, sizeof(Bag)*N, cudaMemcpyHostToDevice);

//size the threads and grids
dim3 dimGrid(1,1,1);
dim3 dimBlock(16,1,1);

//initialize threads and grids
//grid.x = 1;
//threads.x = N;
//initialize and start timer
timer = 0;

//printf("Item count : %i", bag->itemCount);
GPU_bagfrequency <<<dimGrid, dimBlock>>> (c_bag, c_result);
cudaThreadSynchronize();  // this is only needed for timing purposes

//allocate memory for the result on the host 	
float * result_out = (float*) malloc(sizeof(float));

cudaMemcpy(result_out, c_result,sizeof(float)*N, cudaMemcpyDeviceToHost);


//return frequency result to the caller
return (result_out);


// Returns the current time in microseconds
long long start_timer() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000000 + tv.tv_usec;

// Prints the time elapsed since the specified time
long long stop_timer(long long start_time, char *name) {
struct timeval tv;
gettimeofday(&tv, NULL);
long long end_time = tv.tv_sec * 1000000 + tv.tv_usec;
printf("%s: %.5f sec\n", name, ((float) (end_time - start_time)) / (1000 * 1000));
return end_time - start_time;

// Prints the specified message and quits
void die(char *message) {
printf("%s\n", message);