I have written a code for computation of vector norm. The code gives correct result for array size of 1 to 17, but goes hay wire after that. The summation is given by the first element of the array.
[codebox]#include “mex.h”
#include “cuda.h”
#define BLOCK_SIZE 16
global void norm(float* v,int n,int i)
{
int y=threadIdx.y+blockIdx.y*BLOCK_SIZE;
if(y<n/2)
v[i*y]=v[i*y]+v[i*y+i/2];
}
global void corrector(float* v,int n,int i)
{
if(n%2!=0)
v[0]=v[0]+v[i*(n/2-1)+i];
}
void mexFunction(int nlhs, mxArray *plhs, int nrhs, const mxArray *prhs)
{
int n,N;
int dim[2];
int i,space=2;
int nBlock;
float*v,*dv;
float *sum;
n=mxGetM(prhs[0]);
if(n%2!=0)
N=n+1;
else
N=n;
v=(float*)mxGetData(prhs[0]);
dim[0]=N;
dim[1]=1;
plhs[0]=mxCreateNumericArray(1,dim,mxSINGLE_CLASS,mxREAL);
sum=(float*)mxGetData(plhs[0]);
size_t size=N*sizeof(float);
nBlock=(size-1)/BLOCK_SIZE+1;
dim3 dimBlock(1,BLOCK_SIZE);
dim3 dimGrid(1,nBlock);
cudaMalloc((void**)&dv,size);
cudaMemcpy(dv,v,size,cudaMemcpyHostToDevice);
for(i=N;i>=2;i=i/2)
{
norm<<<dimGrid, dimBlock>>>(dv,i,space);
corrector<<<dimGrid, dimBlock>>>(dv,i,space);
space=space*2;
}
cudaMemcpy(sum,dv,N*sizeof(float),cudaMemcpyDeviceToHost);
cudaFree(dv);
}
[/codebox]
May i know what’s wrong with the code?
Also, i did some search, and come across this:
http://developer.download.nvidia.com/compu…an/doc/scan.pdf
Haven’t really read through it properly yet. Can it be used to the summation?