Hello All,
I have a Tegra Xavier device (compute capability 7.2). Since the CPU and GPU on this device share the same physical memory, I am trying to take advantage of that by using mapped host memory. A mapped pointer works well inside a kernel, behaving just like a pointer to GPU memory. But when I pass it to cudaMemcpy, the copy is very slow, as if the data were being copied several times through the CPU. Does anyone know why this happens? Is this the intended behavior, or a defect in the NVIDIA drivers?
// Status of the most recent CUDA runtime call; inspected by cuda_err after each call.
cudaError err = cudaSuccess;

// Byte count shared by all four buffers: 10 * 1024 * 1024 bytes (10 MiB).
int size = 10 * 1024 * 1024;

// Host buffers allocated with the cudaHostAllocMapped flag so that a
// device-side pointer can be queried for each of them below.
unsigned char *h_in = nullptr;
unsigned char *h_out = nullptr;
err = cudaHostAlloc(reinterpret_cast<void**>(&h_in), size, cudaHostAllocMapped);
cuda_err(err)
err = cudaHostAlloc(reinterpret_cast<void**>(&h_out), size, cudaHostAllocMapped);
cuda_err(err)

// Device-side pointers referring to the mapped host buffers above
// (d_in <-> h_in, d_out <-> h_out); no additional memory is allocated here.
unsigned char *d_out = nullptr;
unsigned char *d_in = nullptr;
err = cudaHostGetDevicePointer(reinterpret_cast<void**>(&d_in), h_in, 0);
cuda_err(err)
err = cudaHostGetDevicePointer(reinterpret_cast<void**>(&d_out), h_out, 0);
cuda_err(err)

// Separate buffers obtained from cudaMalloc, used as the comparison case.
unsigned char *d_out1 = nullptr;
unsigned char *d_in1 = nullptr;
err = cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cuda_err(err)
err = cudaMalloc(reinterpret_cast<void**>(&d_out1), size);
cuda_err(err)
// ..
// Runs four benchmark passes of N copies each. Even passes copy between the
// device pointers obtained from cudaHostGetDevicePointer (d_in -> d_out);
// odd passes copy between the cudaMalloc buffers (d_in1 -> d_out1).
for (size_t m = 0; m < 4; m++)
{
    // The source/destination pair depends only on m, so pick it once per pass
    // instead of re-testing m inside the inner loop.
    const bool use_mapped = (m % 2 == 0);
    unsigned char *dst = use_mapped ? d_out : d_out1;
    unsigned char *src = use_mapped ? d_in : d_in1;

    for (size_t i = 0; i < N; i++)
    {
        err = cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice); cuda_err(err)
    }

    // A cudaMemcpy with cudaMemcpyDeviceToDevice performs no host-side
    // synchronization, so the calls above may return before the copies have
    // actually completed. Wait for the device here so that any host timer
    // stopped after this pass measures the copies themselves rather than only
    // the time needed to enqueue them.
    err = cudaDeviceSynchronize(); cuda_err(err)
}
// ...
Results:
m = 0: time = 2081.80 (ms) fps = 480.35 data_rate = 4803.54 (MB/s)
m = 1: time = 16.17 (ms) fps = 61827.35 data_rate = 618273.46 (MB/s)
m = 0: time = 2125.81 (ms) fps = 470.41 data_rate = 4704.10 (MB/s)
m = 1: time = 15.37 (ms) fps = 65077.51 data_rate = 650775.13 (MB/s)