vector dot product Reduction method

Hi,

I am doing self-study on CUDA C and am trying to run the following vector dot-product program from the book "CUDA by Example".
I get a zero value for the dot product.
Could anyone point out the issue with the program, please?
I am using Visual Studio Express 8 to build and run the program on my desktop.

Rohana.

/*
The vector is divided into blocks of a fixed length.
Each thread accumulates the products of corresponding elements across its
assigned positions; these per-thread sums are then stored in a shared-memory
array (the "cache") and reduced to one partial sum per block.
*/

// NOTE: the include paths must use plain ASCII double quotes. The curly
// typographic quotes (“ ”) introduced by a word processor do not compile.
//#include "../common/book.h"
#include "common\book.h"
#include <stdlib.h>
#include <stdio.h>

#define imin(a,b) (a<b?a:b)

const int N = 33 * 1024;          // vector length
const int threadsPerBlock = 32;   // must be a power of 2 for the tree reduction
// One partial sum per block; ceil-divide so the tail of the vector is covered.
// (Both operands are int, so no cast is needed.)
const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

// Per-block partial dot product of a[0..N) and b[0..N).
// Launch layout: 1-D grid, 1-D blocks; blockDim.x must equal threadsPerBlock
// and must be a power of 2 (the halving reduction below requires it).
// Each block writes exactly one partial sum into c[blockIdx.x]; the host
// finishes the final accumulation over blocks.
// NOTE: the qualifiers must be spelled __global__ / __shared__ (double
// underscores) -- pasting code through a word processor often strips them,
// and without them this does not compile as CUDA device code.
__global__ void dot( float *a, float *b, float *c ) {
    // One partial-sum slot per thread in this block.
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread privately accumulates the products of
    // every element it is responsible for, so any grid size covers all N.
    float temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Publish this thread's partial sum to shared memory.
    cache[cacheIndex] = temp;

    // All threads must have written their slot before the reduction reads it.
    __syncthreads();

    // Tree reduction: halve the active span each pass.
    // threadsPerBlock must be a power of 2 or elements would be dropped.
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();  // barrier kept outside the if: every thread reaches it
        i /= 2;
    }

    // Thread 0 now holds the block total; store it for the host-side finish.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}

// Minimal error-check helper. The original ran every CUDA call unchecked, so
// a failed allocation or kernel launch silently produced the all-zero result
// being asked about. (HANDLE_ERROR from book.h serves the same purpose.)
static void checkCuda( cudaError_t err, const char *what ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString( err ) );
        exit( EXIT_FAILURE );
    }
}

int main( void ) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // Allocate memory on the CPU side.
    // partial_c holds one partial sum per block, so blocksPerGrid entries
    // suffice (the original over-allocated N of them).
    a = new float[N];
    b = new float[N];
    partial_c = new float[blocksPerGrid];

    // Allocate the memory on the GPU.
    checkCuda( cudaMalloc( (void**)&dev_a, N * sizeof(float) ), "cudaMalloc dev_a" );
    checkCuda( cudaMalloc( (void**)&dev_b, N * sizeof(float) ), "cudaMalloc dev_b" );
    checkCuda( cudaMalloc( (void**)&dev_partial_c, blocksPerGrid * sizeof(float) ),
               "cudaMalloc dev_partial_c" );

    // Fill in the host memory with data (a and b both have N elements).
    for (int i = 0; i < N; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    // Copy the input arrays 'a' and 'b' to the GPU.
    checkCuda( cudaMemcpy( dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice ),
               "cudaMemcpy a" );
    checkCuda( cudaMemcpy( dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice ),
               "cudaMemcpy b" );

    dot<<< blocksPerGrid, threadsPerBlock >>>( dev_a, dev_b, dev_partial_c );
    // A kernel launch returns no status itself; a bad configuration or an
    // uncompilable kernel surfaces here. Without this check the stale zeros
    // in dev_partial_c are copied back and the dot product "is" zero.
    checkCuda( cudaGetLastError(), "kernel launch" );

    // Copy the per-block partial sums back (this cudaMemcpy also synchronizes
    // with the kernel, so no explicit cudaDeviceSynchronize is needed).
    checkCuda( cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid * sizeof(float),
                           cudaMemcpyDeviceToHost ),
               "cudaMemcpy partial_c" );

    // Finish the reduction on the CPU side: sum the per-block partials.
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }
    printf("cFinal= %f\n", c);

    // Closed-form check: sum of i*(2i) for i in [0, N) is 2 * sum of squares.
    // With N = 33*1024 the result (~2.57e13) exceeds float precision, so only
    // the leading digits printed by %.6g are expected to agree.
    #define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
    printf( "Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares( (float)(N - 1) ) );

    // Free memory on the GPU side.
    checkCuda( cudaFree( dev_a ), "cudaFree dev_a" );
    checkCuda( cudaFree( dev_b ), "cudaFree dev_b" );
    checkCuda( cudaFree( dev_partial_c ), "cudaFree dev_partial_c" );

    // Free memory on the CPU side (new[] pairs with delete[], not free()).
    delete [] a;
    delete [] b;
    delete [] partial_c;

    system("pause");  // keep the console window open under Visual Studio
    return 0;
}