I am a newbie in CUDA programming (on a Quadro FX 5800, 4 GB, Win7 64-bit).

I am trying matrix multiplication. It works well, BUT

when analysing performance with Nsight, I realized that copying from device to host took about 200x longer than copying from host to device!

Is this normal?

```
Nsight report :
cuMemcpyDtoH_v2 1 738 671 µs for a 2000 x 2000 matrix
cuMemcpyHtoD_v2 9 317 µs for a 2000 x 2000 matrix
```

code:

// Dimensions of matrix M.
// NOTE(review): "LIG"/"COL" presumably mean rows (French "lignes") and
// columns -- confirm against the rest of the program.
unsigned int M_DIM_LIG = 2000;
unsigned int M_DIM_COL = 2000;

// Matrix P is declared with the same dimensions as M.
unsigned int P_DIM_LIG = M_DIM_LIG;
unsigned int P_DIM_COL = M_DIM_COL;

…

// Host-side buffer for matrix M, one float per element.
float *matM = new float[M_DIM_LIG * M_DIM_COL];
// Host-side buffer that is the destination of the device-to-host copy of devP.
// Fix: the pointer declarator '*' was missing, so the result of new[] was
// being assigned to a plain float, which does not compile.
float *matPGPU = new float[P_DIM_LIG * P_DIM_COL];
…
// Allocate the device buffers devM, devN and devP, one float per element.
// Fix: the '*' between M_DIM_COL and M_DIM_LIG had been lost in the first
// call, leaving the undeclared identifier "M_DIM_COLM_DIM_LIG".
// sizeof(float) is placed first so the byte count is evaluated in size_t
// rather than in 32-bit unsigned arithmetic, which could wrap for large sizes.
// NOTE(review): the cudaError_t return values are ignored here; checking them
// would keep an allocation failure from surfacing later as an unrelated error.
cudaMalloc((void **) &devM, sizeof(float) * M_DIM_COL * M_DIM_LIG);
cudaMalloc((void **) &devN, sizeof(float) * N_DIM_COL * N_DIM_LIG);
cudaMalloc((void **) &devP, sizeof(float) * P_DIM_COL * P_DIM_LIG);

…

// Upload matrix M from host buffer matM to device buffer devM
// (Nsight measurement: 9 317 µs).
// sizeof(float) is placed first so the byte count is evaluated in size_t
// rather than in 32-bit unsigned arithmetic.
cudaMemcpy(devM, matM, sizeof(float) * M_DIM_COL * M_DIM_LIG,
           cudaMemcpyHostToDevice);

…

// Download the result from device buffer devP into host buffer matPGPU
// (Nsight measurement: 1 738 671 µs).
// NOTE(review): kernel launches are asynchronous, and cudaMemcpy waits for
// previously issued device work to finish before copying. If the
// multiplication kernel is launched just before this call (not visible in
// this excerpt), the time reported for this copy presumably includes the
// kernel's execution, not just the transfer. Calling cudaDeviceSynchronize()
// (cudaThreadSynchronize() on older toolkits) before the copy would separate
// the two in the profile -- TODO confirm.
// sizeof(float) is placed first so the byte count is evaluated in size_t
// rather than in 32-bit unsigned arithmetic.
cudaMemcpy(matPGPU, devP, sizeof(float) * P_DIM_COL * P_DIM_LIG,
           cudaMemcpyDeviceToHost);