# vector dot product Reduction method

Hi,

I am doing self-study on CUDA C, and trying to run the following vector-product program from the book on CUDA by Example.
I get zero value for the dot product.
Could anyone point out the issue with the program please.
I am using VisualStudio Express 8 to run the program on my desktop.

Rohana.

/*
Here the vector length is divided into blocks of certain lengths.
These sums are then stored in the same position in array in the cache.

*/

//#include “…/common/book.h”
#include “common\book.h”
#include <stdlib.h>
#include <stdio.h>

#define imin(a,b) (a<b?a:b)

const int N = 33 * 1024; //20 * 500; //64;//This is same as vector lengths!
//const int blocksPerGrid = imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );//min of (32, 132)(Number of blocks smaller than the vector length)
const int blocksPerGrid = (int) (N+threadsPerBlock-1) / threadsPerBlock ;//min of (32, 132)(Number of blocks smaller than the vector length)

global void dot( float *a, float *b, float *c ) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;

``````float   temp = 0;
while (tid < N) {
temp += a[tid] * b[tid]; //temp is private for each thread!

tid += blockDim.x * gridDim.x;//Not yet lauched ??
//printf("tid=%d", tid);
}

// set the cache values
cache[cacheIndex] = temp;//Per block?

// synchronize threads in this block

// for reductions, threadsPerBlock must be a power of 2
// because of the following code
//---- Now start adding the values in the cache, each time halving the span of the cache index
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
__syncthreads();//To gurantee that the cash is filled.
i /= 2;
//i = i/2;
}

if (cacheIndex == 0)
c[blockIdx.x] = cache;//(Move to a different array, which is on Device.)
``````

}

int main( void ) {
float *a, *b, c, *partial_c;
float *dev_a, *dev_b, *dev_partial_c;

``````// Allocate memory on the cpu side
a = new float[N];//Should not have a space
b = new float[N];
partial_c =  new float[N];

//a = (float*)malloc( N*sizeof(float) );
//b = (float*)malloc( N*sizeof(float) );
//partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

// Allocate the memory on the GPU
//HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(float) ) );
//HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(float) ) );
//HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c, blocksPerGrid * sizeof(float) ) );
cudaMalloc( (void**)&dev_a, N * sizeof(float) );
cudaMalloc( (void**)&dev_b, N * sizeof(float) );
cudaMalloc( (void**)&dev_partial_c, blocksPerGrid * sizeof(float) );

// Fill in the host memory with data
for (int i=0; i<N; i++) {
a[i] = i;//Note: a, b, both have the same dimesions as N!
b[i] = i*2;
//a[i] = (float) i;//Note: a, b, both have the same dimesions as N!
//b[i] = (float) i*2;
//printf("ai= %f bi=%f\n", a[i], b[i]);
}

// Copy the arrays 'a' and 'b' to the GPU
//HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice ) );
//HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice ) );
cudaMemcpy( dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice );

dot<<< blocksPerGrid, threadsPerBlock >>>( dev_a, dev_b, dev_partial_c );

// copy the array 'c' back from the GPU to the CPU
//HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost ) );

cudaMemcpy( partial_c, dev_partial_c, blocksPerGrid * sizeof(float),cudaMemcpyDeviceToHost );

// finish up on the CPU side
c = 0;
for (int i=0; i < blocksPerGrid; i++)
{
c += partial_c[i];
//printf("c= %f\n", c);
}
printf("cFinal= %f\n", c);

#define sum_squares(x)  (x*(x+1)*(2*x+1)/6)
printf( "Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares( (float)(N - 1) ) );

// free memory on the gpu side
//HANDLE_ERROR( cudaFree( dev_a ) );
//HANDLE_ERROR( cudaFree( dev_b ) );
//HANDLE_ERROR( cudaFree( dev_partial_c ) );

// free memory on the cpu side
//free( a );
//free( b );
//free( partial_c );
cudaFree (dev_a);
cudaFree (dev_b);
cudaFree (dev_partial_c );

delete [] a;
delete [] b;
delete [] partial_c;

system("pause");
``````

}