I recently ran the Visual Profiler on a parallel reduction code I wrote, and this is the result:
I am not very experienced at interpreting CUDA profiler output, but is it normal for cudaMemcpy to take up more than 90% of the execution time? The array I am trying to reduce has 1048576 elements. Is there any way to speed up the transfer, or am I using cudaMemcpy the wrong way? My reduction code is below, followed by a small standalone sketch of how I could time the copy on its own:
[codebox]#include "mex.h"
#include "math.h"
#include "cuda.h"
#define BLOCK_SIZE 512
__global__ void reduction0(float *x, float *psum, int n)
{
    __shared__ float sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*2*blockDim.x + threadIdx.x;
    unsigned int gridSize = BLOCK_SIZE*2*gridDim.x;

    /* grid-stride loop: each thread accumulates two elements per pass */
    sdata[tid] = 0;
    while (i < n) {
        sdata[tid] += x[i];
        if (i + blockDim.x < n)          /* guard in case n is not a multiple of 2*blockDim.x */
            sdata[tid] += x[i + blockDim.x];
        i += gridSize;
    }
    __syncthreads();
    /* tree reduction in shared memory down to the last 64 elements */
    for (unsigned int stride = blockDim.x>>1; stride > 32; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }
    /* unrolled final warp; the volatile pointer keeps the compiler from
       caching shared-memory values in registers between the steps */
    if (tid < 32) {
        volatile float *smem = sdata;
        smem[tid] += smem[tid + 32];
        smem[tid] += smem[tid + 16];
        smem[tid] += smem[tid + 8];
        smem[tid] += smem[tid + 4];
        smem[tid] += smem[tid + 2];
        smem[tid] += smem[tid + 1];
    }
    if (tid == 0) psum[blockIdx.x] = sdata[0];   /* one partial sum per block */
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    float *x, *dx;
    float *sum, *dpsum;
    int length;
    int blockno = 64;                    /* number of blocks = number of partial sums returned */
    mwSize dim[2];

    /* input: a single-precision row vector */
    x = (float*)mxGetData(prhs[0]);
    length = (int)mxGetN(prhs[0]);

    /* output: 1 x blockno single array holding the per-block partial sums */
    dim[0] = 1;
    dim[1] = blockno;
    plhs[0] = mxCreateNumericArray(2, dim, mxSINGLE_CLASS, mxREAL);
    sum = (float*)mxGetData(plhs[0]);

    cudaMalloc((void**)&dx, length*sizeof(float));
    cudaMalloc((void**)&dpsum, blockno*sizeof(float));
    /* copy the input to the device, run one reduction pass, copy back the per-block sums */
    cudaMemcpy(dx, x, length*sizeof(float), cudaMemcpyHostToDevice);
    reduction0<<<blockno,BLOCK_SIZE>>>(dx, dpsum, length);
    cudaMemcpy(sum, dpsum, blockno*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(dx);
    cudaFree(dpsum);
    return;
}[/codebox]
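For reference, here is a rough standalone sketch (separate from the mex file above) of how the host-to-device copy alone could be timed with CUDA events, comparing an ordinary malloc buffer against pinned memory from cudaMallocHost; the element count matches the 1048576-element case above. It is only meant to isolate the transfer cost, not to be a tuned benchmark:

[codebox]#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

int main()
{
    const int n = 1048576;                      /* same element count as above */
    const size_t bytes = n * sizeof(float);

    float *pageable = (float*)malloc(bytes);    /* ordinary host allocation */
    float *pinned;
    cudaMallocHost((void**)&pinned, bytes);     /* page-locked host allocation */
    memset(pageable, 0, bytes);                 /* touch the pages so the comparison is fair */
    memset(pinned, 0, bytes);

    float *d;
    cudaMalloc((void**)&d, bytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float ms;

    /* time a host-to-device copy from pageable memory */
    cudaEventRecord(start, 0);
    cudaMemcpy(d, pageable, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("pageable H2D: %.3f ms\n", ms);

    /* time the same copy from pinned memory */
    cudaEventRecord(start, 0);
    cudaMemcpy(d, pinned, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    printf("pinned H2D:   %.3f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d);
    cudaFreeHost(pinned);
    free(pageable);
    return 0;
}[/codebox]

If the pageable copy alone already accounts for most of the total time, that would at least confirm what the profiler is showing.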