Hi,
I had to face with problem that associated with parallel utilization multiple GPU
test code
[codebox]#pragma omp parallel
{
#pragma omp sections
{
cudaSetDevice(cpu_thread_id);
cudaMallocHost((void **)&H1, str*sizeof(double));
cudaMalloc((void **)&D1, str*sizeof(double));
cudaMemcpy(D1,H1, str*sizeof(double), cudaMemcpyHostToDevice);
cudaFree(H1);
cudaFree(D1);
}
#pragma omp section
{
cudaSetDevice(cpu_thread_id);
cudaMallocHost((void **)&H2, str*sizeof(double));
cudaMalloc((void **)&D2, str*sizeof(double));
cudaMemcpy(D2,H2, str*sizeof(double), cudaMemcpyHostToDevice);
cudaFree(H2);
cudaFree(D2);
}
}
[/codebox]
this implementation work in sequential
copy HostToDevice occur one by one
Is it possible for this model to work in parallel?