Summation of 1D array of arbitrary size

I have written a code for computation of vector norm. The code gives correct result for array size of 1 to 17, but goes hay wire after that. The summation is given by the first element of the array.

[codebox]#include “mex.h”

#include “cuda.h”

#define BLOCK_SIZE 16

global void norm(float* v,int n,int i)

{

int y=threadIdx.y+blockIdx.y*BLOCK_SIZE;

if(y<n/2)

v[i*y]=v[i*y]+v[i*y+i/2];

}

global void corrector(float* v,int n,int i)

{

if(n%2!=0)

v[0]=v[0]+v[i*(n/2-1)+i];

}

void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)

{

int n,N;

int dim[2];

int i,space=2;

int nBlock;

float*v,*dv;

float *sum;

n=mxGetM(prhs[0]);

if(n%2!=0)

N=n+1;

else

N=n;

v=(float*)mxGetData(prhs[0]);

dim[0]=N;

dim[1]=1;

plhs[0]=mxCreateNumericArray(1,dim,mxSINGLE_CLASS,mxREAL);

sum=(float*)mxGetData(plhs[0]);

size_t size=N*sizeof(float);

nBlock=(size-1)/BLOCK_SIZE+1;

dim3 dimBlock(1,BLOCK_SIZE);

dim3 dimGrid(1,nBlock);

cudaMalloc((void**)&dv,size);

cudaMemcpy(dv,v,size,cudaMemcpyHostToDevice);

for(i=N;i>=2;i=i/2)

{

norm<<<dimGrid, dimBlock>>>(dv,i,space);

corrector<<<dimGrid, dimBlock>>>(dv,i,space);

space=space*2;

}

cudaMemcpy(sum,dv,N*sizeof(float),cudaMemcpyDeviceToHost);

cudaFree(dv);

}

[/codebox]

May i know what’s wrong with the code?

Also, i did some search, and come across this:

http://developer.download.nvidia.com/compu…an/doc/scan.pdf

Haven’t really read through it properly yet. Can it be used to the summation?