I have written a sum-reduction program that works across multiple blocks. The input size should be a power of 2. It works correctly only when I use shared memory in the kernel: try removing the shared-memory qualifier from the declaration and you won't get the correct answer. The answer is then zero, because I initialized the device array to 0 in the first place.
That is, use int perblk[512]; rather than __shared__ int perblk[512];
I don't think initializing shared memory is mandatory; I have written other programs without it. Please help me with this…
I am posting the code here, and I have also attached the file.
[codebox]/* Program on sum reduction, for multiple blocks
- input array size should be a power of 2
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#define CLK_PER_SEC 1000.0
/* Program works correctly only if shared memory is used in the kernel,
else the output array will be zero */
/*
 * redsum: per-block tree reduction (sum).
 *
 * Each block loads blockDim.x consecutive ints from `perblocks`, adds them
 * up in shared memory and stores the block's total in cd[blockIdx.x].
 *
 * Preconditions:
 *   - 1D launch; blockDim.x is a power of 2 and at most 512 (size of perblk).
 *   - `perblocks` holds at least gridDim.x * blockDim.x readable ints
 *     (there is no bounds check, so the caller must zero-pad the tail).
 *   - `cd` holds at least gridDim.x ints.
 *
 * Why perblk MUST be __shared__: without the qualifier the array becomes a
 * private, per-thread local array. Each thread would then only ever have
 * written its own perblk[t], so perblk[t + stride] reads that thread's own
 * uninitialized storage instead of a neighbour's value and the sum is
 * garbage. The reduction relies on threads seeing each other's writes,
 * which only shared (or global) memory provides.
 */
__global__ void redsum(int *perblocks, int *cd)
{
    __shared__ int perblk[512];

    int t = threadIdx.x;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage this block's slice of the input in shared memory.
    perblk[t] = perblocks[col];
    __syncthreads();

    // Halve the number of active threads each step. The barrier sits at the
    // end of the iteration (outside the divergent `if`) so every thread of
    // the block reaches it, and so the final step is also followed by one.
    for (int stride = blockDim.x / 2; stride >= 1; stride /= 2)
    {
        if (t < stride)
            perblk[t] += perblk[t + stride];
        __syncthreads();
    }

    // Only one thread publishes the block total; previously every thread
    // wrote cd[blockIdx.x], some of them before thread 0 had finished the
    // last addition.
    if (t == 0)
        cd[blockIdx.x] = perblk[0]; // sum of every block is stored in auxiliary array
}
/* Abort with a readable message when a CUDA runtime call fails. */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: reads an array size, fills the array with 1..n, sums it on
 * the CPU and on the GPU (repeated redsum passes until one value remains)
 * and prints both results with their timings.
 *
 * The device buffers are padded with zeros to a whole number of thread
 * blocks, because redsum has no bounds check and always reads
 * gridDim.x * blockDim.x elements. With that padding any positive n works,
 * not only powers of 2.
 *
 * Note: both sums are accumulated in int, so they overflow for n >= 65536.
 */
int main()
{
    const int threads = 64; // threads per block: power of 2, <= 512 (redsum's shared array)
    int n = 0, i;

    printf("Enter array size: ");
    if (scanf("%d", &n) != 1 || n <= 0) // reading array size
    {
        fprintf(stderr, "Array size must be a positive integer\n");
        return 1;
    }

    size_t size = sizeof(int) * (size_t)n;
    int *partialsumh = (int *)malloc(size);
    if (partialsumh == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    for (i = 0; i < n; i++)
        partialsumh[i] = i + 1; // initializing array 1,2,3...
    printf("\nThe array is: ");
    for (i = 0; i < n; i++)
        printf("%d ", partialsumh[i]);
    printf("\n");

    // Time only the summation so the figure is comparable with the GPU
    // timing (console output is not part of the reduction).
    clock_t starth = clock();
    int result = 0;
    for (i = 0; i < n; i++)
        result = result + partialsumh[i]; // finding sum by CPU
    clock_t endh = clock();
    // clock() returns ticks; convert to milliseconds before printing as "ms".
    double timing = 1000.0 * (double)(endh - starth) / CLOCKS_PER_SEC;
    printf("\nTime taken by CPU is %.2fms", timing);

    // Number of blocks for the first pass, rounded up, and the padded
    // buffer size that lets every launched thread read a valid element.
    int nb = n / threads + ((n % threads == 0) ? 0 : 1);
    size_t padded = sizeof(int) * (size_t)nb * (size_t)threads;

    int *partialsumd = NULL, *cd = NULL;
    checkCuda(cudaMalloc((void **)&partialsumd, padded), "cudaMalloc(partialsumd)");
    checkCuda(cudaMalloc((void **)&cd, padded), "cudaMalloc(cd)");
    checkCuda(cudaMemset(partialsumd, 0, padded), "cudaMemset(partialsumd)"); // zero padding
    checkCuda(cudaMemcpy(partialsumd, partialsumh, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

    /* Reduce repeatedly: each pass turns `count` values into one partial sum
       per block. The two buffers swap roles after every pass, and the output
       buffer is cleared first so that the unused tail reads as zero when it
       becomes the next pass's input. */
    int *in = partialsumd;
    int *out = cd;
    int count = n;
    do
    {
        nb = count / threads + ((count % threads == 0) ? 0 : 1);
        checkCuda(cudaMemset(out, 0, padded), "cudaMemset(out)");
        redsum<<<nb, threads>>>(in, out);
        checkCuda(cudaGetLastError(), "redsum launch");

        int *swap = in;
        in = out;
        out = swap;
        count = nb;
    } while (count > 1);

    // copy final answer to the host (only the first element is the result)
    int gpu_sum = 0;
    checkCuda(cudaMemcpy(&gpu_sum, in, sizeof(int), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

    float time2 = 0.0f;
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time2, start, stop), "cudaEventElapsedTime");

    printf("\nThe sum computed by CPU is %d ", result);
    printf("\nTime taken by GPU is %.2fms", time2);
    printf("\nthe sum computed by GPU is %d \n\n", gpu_sum);

    // Release everything that was acquired above.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(partialsumd), "cudaFree(partialsumd)");
    checkCuda(cudaFree(cd), "cudaFree(cd)");
    free(partialsumh);
    return 0;
}[/codebox]