// Binds the calling host thread to GPU `nGPUIdx` and asks the runtime to put
// the host thread to sleep (instead of spinning or yielding) whenever it has
// to wait for the device.
//
// nGPUIdx: ordinal of the CUDA device to select.
//
// The status of the last runtime call made here is left in `err`; if selecting
// the device fails, the flags are not touched and `err` holds that failure.
D2D_CUDA8::D2D_CUDA8(int nGPUIdx)
{
    // Select the device first: cudaSetDeviceFlags() records its flags for the
    // *current* device, so calling it before cudaSetDevice() would configure
    // whichever device happened to be current rather than nGPUIdx.
    err = cudaSetDevice(nGPUIdx);
    if (err != cudaSuccess)
    {
        return;
    }

    // The scheduling flags are mutually exclusive, so only one call is made
    // (a second call would simply replace the first). Blocking sync is the
    // mode that keeps host CPU usage lowest while waiting on the GPU;
    // cudaDeviceScheduleBlockingSync is the current name of the deprecated
    // cudaDeviceBlockingSync.
    err = cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
// Allocates `size` bytes with cudaMallocHost() and stores the resulting
// pointer in RDMA_Memory. The call's status is written to `err`.
//
// size: number of bytes to allocate.
//
// NOTE(review): cudaMallocHost() returns page-locked *host* memory, not
// device memory — confirm this is the intended kind of buffer.
void D2D_CUDA8::RDMASetMemory(SIZE_T size)
{
    void** const bufferSlot = (void**)&RDMA_Memory;
    err = cudaMallocHost(bufferSlot, size);
}
// Copies H rows of W ints each from `Source` into RDMA_Memory, one
// cudaMemcpyAsync() call per row, issued on the default stream.
//
// Source: start of the source image; row i is read from Source + i * W.
// W:      number of ints per row.
// H:      number of rows.
//
// `err` receives the status of the last copy issued. Copying stops at the
// first failing row so that the failure is not overwritten by a later row
// that happens to succeed. When H is 0 no call is made and `err` is left
// untouched.
//
// NOTE(review): source and destination rows use the same stride, so if
// RDMA_Memory is an int-sized element pointer the whole transfer is contiguous
// and could be issued as one W * H * sizeof(int) copy, which would cut the
// per-call host overhead — confirm the type of RDMA_Memory before doing so.
void D2D_CUDA8::CopyMemoryToDevice(int* Source, SIZE_T W, SIZE_T H)
{
    // Byte count of one row; identical for every iteration.
    const SIZE_T rowBytes = W * sizeof(int);

    for (SIZE_T row = 0; row < H; ++row)
    {
        const SIZE_T rowOffset = row * W;
        err = cudaMemcpyAsync(RDMA_Memory + rowOffset, Source + rowOffset, rowBytes, cudaMemcpyHostToDevice);
        if (err != cudaSuccess)
        {
            // Keep the first error visible to the caller.
            break;
        }
    }
}
// Issues one cudaMemcpyAsync() of `size` bytes from RDMA_Memory into the
// buffer passed as `Source`, on the default stream. The call's status is
// written to `err`.
//
// Source: despite its name, this is the *destination* of the copy.
// size:   number of bytes to copy.
void D2D_CUDA8::CopyMemoryToHost(int* Source, SIZE_T size)
{
    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    err = cudaMemcpyAsync(Source, RDMA_Memory, size, direction);
}
// Driver sketch: creates a D2D_CUDA8 bound to GPU index 1, then repeats an
// upload from m_mainMemory followed by a download into m_subMemory 100 times.
// NOTE(review): this mixes C#-style constructs (unsafe, ToPointer, ulong) with
// a C++-style main; it reads as pseudo-code for a managed caller — confirm.
int main()
{
// GPU index 1 is hard-coded here.
cuda = new D2D_CUDA8(1);
// NOTE(review): RDMASetMemory() is not called anywhere in this snippet;
// presumably the buffer is allocated elsewhere — verify.
for (int i = 0; i < 100; i++)
{
unsafe
{
// NOTE(review): the second argument is CopyMemoryToDevice's W (ints per row),
// and that function copies W ints for each of H rows. Passing 40000 * 40000
// as W together with m_mainMemory.H as H looks like it requests far more than
// one image — confirm the intended W/H.
cuda.CopyMemoryToDevice((int*)m_mainMemory.GetPtr().ToPointer(), (ulong)(40000 * 40000), (ulong)m_mainMemory.H);
// NOTE(review): the second argument is used by CopyMemoryToHost as a byte
// count, so this copies 40000 * 40000 bytes, not that many ints — confirm.
cuda.CopyMemoryToHost((int*)m_subMemory.GetPtr().ToPointer(), (ulong)(40000 * 40000));
}
}
}
I wrote code that copies a 40000×40000 image from mainMemory to the device, and then from the device back to subMemory. The copy itself works correctly.
(To make the load visible in Task Manager, the copy is repeated about 100 times.)
I assumed that this copy was performed through DMA.
However, this code constantly generates a CPU load of approximately 10 to 20%.
I am doing nothing other than copying memory. Is the copy performed by the GPU through DMA, but still with help from the CPU?
Can't this copy be carried out by the GPU alone, without any CPU load?
My program is already under heavy CPU load, so I don't want the cudaMemcpy calls to add any more.