I have done visual profiling on a parallel reduction code I wrote recently, and this is the result:

I am not too experienced in interpreting CUDA profiler results, but is it normal for cudaMemcpy to take up more than 90% of the execution time? The array size that I attempt to reduce is 1048576. Is there any way to speed up the process, or did I use the cudaMemcpy function in the wrong way? The following is my reduction code:

[codebox]#include "mex.h"

#include "math.h"

#include "cuda.h"

#include "cuda_runtime.h"

#define BLOCK_SIZE 512

// Block-level sum reduction with a grid-stride loop.
//
// Launch: reduction0<<<numBlocks, BLOCK_SIZE>>>(x, psum, n)
//   x    : device array of n floats (read-only)
//   psum : device array of gridDim.x floats; block b writes its partial
//          sum to psum[b].  The host (or a second pass) adds these up.
// Requires blockDim.x == BLOCK_SIZE (shared-memory tile size).
__global__ void reduction0(float *x, float *psum, int n)
{
    __shared__ float sdata[BLOCK_SIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * 2 * blockDim.x + threadIdx.x;
    unsigned int gridSize = BLOCK_SIZE * 2 * gridDim.x;

    // Grid-stride accumulation: each thread sums two elements per trip so
    // every iteration issues full-width loads.  The second load is guarded
    // so n need not be a multiple of 2*blockDim.x (the original read
    // x[i + blockDim.x] out of bounds for such sizes).  Accumulate in a
    // register and store to shared memory once.
    float acc = 0.0f;
    while (i < n) {
        acc += x[i];
        if (i + blockDim.x < n)
            acc += x[i + blockDim.x];
        i += gridSize;
    }
    sdata[tid] = acc;
    __syncthreads();

    // Tree reduction in shared memory down to the last warp.
    for (unsigned int stride = blockDim.x >> 1; stride > 32; stride >>= 1) {
        if (tid < stride)
            sdata[tid] += sdata[tid + stride];
        __syncthreads();
    }

    // Final warp.  The original relied on implicit warp synchrony with
    // plain shared-memory accesses; that is a data race (the compiler may
    // cache sdata in registers) and is invalid on Volta+ independent
    // thread scheduling.  Use volatile accesses plus __syncwarp() between
    // dependent steps.  All 32 lanes of warp 0 enter this branch, so the
    // full-mask __syncwarp() is safe.
    if (tid < 32) {
        volatile float *vs = sdata;
        vs[tid] += vs[tid + 32]; __syncwarp();
        vs[tid] += vs[tid + 16]; __syncwarp();
        vs[tid] += vs[tid + 8];  __syncwarp();
        vs[tid] += vs[tid + 4];  __syncwarp();
        vs[tid] += vs[tid + 2];  __syncwarp();
        vs[tid] += vs[tid + 1];  __syncwarp();
    }

    // One result per block.
    if (tid == 0)
        psum[blockIdx.x] = sdata[0];
}

// MEX gateway: partial sum-reduction of a single-precision vector on the GPU.
//
//   prhs[0] : 1-by-N single array to reduce.
//   plhs[0] : 1-by-64 single array of per-block partial sums; the caller
//             finishes the reduction with sum(plhs[0]) in MATLAB.
//
// Profiling note: with ~4 MB transferred and a very cheap kernel, it is
// expected that cudaMemcpy dominates the timeline — the host<->device PCIe
// copy is the real cost, not a misuse of the API.  Pinned host memory and
// cudaMemcpyAsync would help, but MATLAB owns this buffer; the best fix is
// to keep data resident on the device across calls.
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // Validate the input before touching the data.
    if (nrhs != 1 || !mxIsSingle(prhs[0]))
        mexErrMsgTxt("reduction0: expected one single-precision input array.");

    // mxGetData (not mxGetPr, which is the double* accessor) for a single array.
    float *x = (float *)mxGetData(prhs[0]);
    int length = (int)mxGetNumberOfElements(prhs[0]);

    const int blockno = 64;                  // grid size == output length
    mwSize dim[2] = {1, (mwSize)blockno};    // mwSize, as the API requires
    plhs[0] = mxCreateNumericArray(2, dim, mxSINGLE_CLASS, mxREAL);
    float *sum = (float *)mxGetData(plhs[0]);

    float *dx = NULL, *dpsum = NULL;
    if (cudaMalloc((void **)&dx, length * sizeof(float)) != cudaSuccess ||
        cudaMalloc((void **)&dpsum, blockno * sizeof(float)) != cudaSuccess) {
        cudaFree(dx);                        // cudaFree(NULL) is a no-op
        mexErrMsgTxt("reduction0: cudaMalloc failed.");
    }

    cudaMemcpy(dx, x, length * sizeof(float), cudaMemcpyHostToDevice);

    reduction0<<<blockno, BLOCK_SIZE>>>(dx, dpsum, length);

    // The blocking device-to-host copy also synchronizes with the kernel;
    // afterwards, pick up any error from the launch or the copies.
    cudaMemcpy(sum, dpsum, blockno * sizeof(float), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();

    cudaFree(dx);
    cudaFree(dpsum);

    if (err != cudaSuccess)
        mexErrMsgTxt(cudaGetErrorString(err));
}
[/codebox]