Timeline misaligned between MemCpy and compute kernels in the Visual Profiler

It seems that the MemCpy timeline runs slower than the Compute timeline,
even though every API call is synchronized with the host thread.
You can see the apparent overlap between the MemCpy and the kernel computation in the attached picture.

This makes it impossible for me to debug the code. Any ideas on how to solve this problem?

The code file bug.cu is shown below; it is compiled with this command line:
$ nvcc -ftz true -O3 -gencode arch=compute_20,code=sm_20 -o bug bug.cu -lcublas -lcurand

#include <cstdio>
#include <cstdlib>

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>

// Abort with a readable message if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Abort with the numeric status code if a cuBLAS call failed.
static void checkCublas(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "%s failed: cuBLAS status %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

// Abort with the numeric status code if a cuRAND call failed.
static void checkCurand(curandStatus_t status, const char *what)
{
    if (status != CURAND_STATUS_SUCCESS)
    {
        fprintf(stderr, "%s failed: cuRAND status %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

// Reproducer: ten iterations of
//   (1) fill `a` with uniform random doubles on the device,
//   (2) copy the host buffer `h` into the device buffer `b`,
//   (3) compute c = alpha * a * b + beta * c with cublasDgemm,
// with a cudaDeviceSynchronize() after every step so that each operation
// has finished before the host issues the next one.
//
// Returns 0 on success; exits with EXIT_FAILURE as soon as any CUDA,
// cuBLAS or cuRAND call reports an error.
int main()
{
    cublasHandle_t hd;
    curandGenerator_t rng;
    checkCublas(cublasCreate(&hd), "cublasCreate");
    checkCurand(curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_MTGP32),
                "curandCreateGenerator");

    const size_t m = 5000, n = 1000;
    const double alpha = 1.0;
    const double beta = 0.0;

    // The element type must be spelled out: the buffers are handed to
    // curandGenerateUniformDouble and cublasDgemm, which take double*.
    thrust::host_vector<double> h(n * m, 0.1);
    thrust::device_vector<double> a(m * n, 0.1);
    thrust::device_vector<double> b(n * m, 0.1);
    thrust::device_vector<double> c(m * m, 0.1);
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (setup)");

    double *const aPtr = thrust::raw_pointer_cast(&a[0]);
    double *const bPtr = thrust::raw_pointer_cast(&b[0]);
    double *const cPtr = thrust::raw_pointer_cast(&c[0]);

    // cublasDgemm takes int dimensions; both values fit comfortably.
    const int rows = static_cast<int>(m);
    const int inner = static_cast<int>(n);

    for (int i = 0; i < 10; i++)
    {
        checkCurand(curandGenerateUniformDouble(rng, aPtr, a.size()),
                    "curandGenerateUniformDouble");
        checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (curand)");

        // Host-to-device transfer: `h` lives on the host, `b` on the device.
        thrust::copy(h.begin(), h.end(), b.begin());
        checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (copy)");

        // a is used as an (m x n) matrix with leading dimension m,
        // b as (n x m) with leading dimension n, c as (m x m) with
        // leading dimension m; neither input is transposed.
        checkCublas(cublasDgemm(hd, CUBLAS_OP_N, CUBLAS_OP_N,
                                rows, rows, inner, &alpha,
                                aPtr, rows,
                                bPtr, inner,
                                &beta,
                                cPtr, rows),
                    "cublasDgemm");
        checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize (dgemm)");
    }

    checkCurand(curandDestroyGenerator(rng), "curandDestroyGenerator");
    checkCublas(cublasDestroy(hd), "cublasDestroy");

    return 0;
}