Hi,
Try this:
[codebox]
// Multi-GPU device-to-host copy benchmark: one OpenMP thread per GPU.
//
// Every thread in the parallel region binds to the CUDA device whose index
// equals its OpenMP thread id, allocates a pinned host buffer and a device
// buffer of NpoinT floats, waits at a barrier so all threads start copying
// together, then times 10000 blocking device-to-host copies.
//
// Differences from the earlier two-`section` version:
//  * The barrier now sits directly inside the parallel region. OpenMP does
//    not allow a barrier to be closely nested inside a worksharing construct
//    such as `sections`, and threads that got no section would never reach it.
//  * The start time is thread-local; the previous shared `t` was written by
//    every thread concurrently (data race).
//  * All `num_used_gpu` threads do the work, not just two.
//  * CUDA return codes are checked; a failing thread reports the error and
//    skips the timing, but still reaches the barrier so the others are not
//    left waiting.
omp_set_num_threads(num_used_gpu);
#pragma omp parallel
{
    const int    cpu_thread_id   = omp_get_thread_num();
    const int    num_cpu_threads = omp_get_num_threads();
    const int    num_copies      = 10000;
    const size_t bytes           = NpoinT*sizeof(float);

    int    gpu_id = -1;
    float *SD     = NULL;   // device buffer (copy source)
    float *SH     = NULL;   // pinned host buffer (copy destination)

    // Bind this CPU thread to "its" GPU. Fails if there are fewer devices
    // than threads; that is reported below instead of being ignored.
    cudaError_t err = cudaSetDevice(cpu_thread_id);
    if (err == cudaSuccess) err = cudaGetDevice(&gpu_id);

    // Keep the historical 'M'/'S' prefix: thread 0 is labelled M, the rest S.
    printf("%c CPU thread %d (of %d) uses CUDA device %d\n",
           cpu_thread_id == 0 ? 'M' : 'S',
           cpu_thread_id, num_cpu_threads, gpu_id);

    if (err == cudaSuccess) err = cudaMallocHost((void **)&SH, bytes);
    if (err == cudaSuccess) err = cudaMalloc((void **)&SD, bytes);

    // Must be executed unconditionally by every thread of the team, so it
    // is deliberately NOT guarded by the error state.
    #pragma omp barrier

    if (err == cudaSuccess) {
        // NOTE(review): cpu_time() is defined elsewhere; presumably it
        // returns seconds as a double (it was printed with %lf) -- confirm.
        const double t_start = cpu_time();

        // The device buffer contents are never initialised; only the
        // transfer time matters here.
        for (int tim = 0; tim < num_copies && err == cudaSuccess; tim++) {
            err = cudaMemcpy(SH, SD, bytes, cudaMemcpyDeviceToHost);
        } // End cycle

        // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
        if (err == cudaSuccess) err = cudaDeviceSynchronize();
        if (err == cudaSuccess) printf("\n---%lf---\n", cpu_time() - t_start);
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "CPU thread %d: CUDA error: %s\n",
                cpu_thread_id, cudaGetErrorString(err));
    }

    // Release whatever was successfully allocated.
    if (SD != NULL) cudaFree(SD);
    if (SH != NULL) cudaFreeHost(SH);
} // End parallel
[/codebox]