I am using a Jetson TX2 board to do FFTs. I use zero copy to pass data from host to device and then run an FFT or IFFT. After the FFT or IFFT I have to copy the data from device back to host, and I tried to use zero copy for that memory copy as well. However, the copy speed is very slow (at least 6 times slower than a normal cudaMemcpy), so the whole memory copy plus FFT became slower after I switched to zero copy. In my understanding, the GPU and CPU on the Jetson TX2 share the same memory, so zero copy should be faster. But zero copy is extremely slow.
Here is my basic code:
// Zero-copy demo: allocates a mapped, pinned host buffer, zeroes it on the
// CPU, lets the sumNum kernel operate on it through its device alias, and
// then prints a few samples directly from the host pointer (no cudaMemcpy).
//
// Relies on sumNum and checkCUDAError, which are defined elsewhere in this
// file. Returns 0 on success, 1 if the host buffer could not be allocated.
//
// NOTE(review): on Tegra parts such as the TX2, NVIDIA's "CUDA for Tegra"
// note describes pinned/mapped host memory as uncached for CPU access, which
// would explain slow host-side reads of `data`; cudaMallocManaged is the
// usual alternative there -- confirm against that document for your JetPack.
int main(void)
{
    // Element count shared by the allocation, the memset, the launch and the
    // print loop, so the four can no longer disagree with each other.
    const size_t numElements = 1000000000;
    const size_t size = numElements * sizeof(int);

    // Launch geometry: ceil-div so the grid covers numElements with the
    // smallest number of blocks. The last block is still only partly used, so
    // this assumes sumNum bounds-checks its index -- TODO confirm.
    const unsigned int threadsPerBlock = 1024;
    const unsigned int numBlocks =
        (unsigned int)((numElements + threadsPerBlock - 1) / threadsPerBlock);

    // Pinned host allocation that is also mapped into the device address space.
    int *data = NULL;
    cudaHostAlloc((void**) &data, size, cudaHostAllocMapped);
    checkCUDAError("cudaHostAlloc data");
    if (data == NULL)
    {
        // Do not touch the buffer if the allocation failed, regardless of
        // whether checkCUDAError aborts on error.
        return 1;
    }
    memset(data, 0, size);

    // Device-side alias of the same physical buffer.
    int *gpudata = NULL;
    cudaHostGetDevicePointer((void**) &gpudata, data, 0);
    checkCUDAError("cudaHostGetDevicePointer");

    sumNum<<<numBlocks, threadsPerBlock>>>(gpudata);
    checkCUDAError("sumNum launch");

    // The kernel runs asynchronously; wait for it before reading on the host.
    cudaDeviceSynchronize();
    checkCUDAError("cudaDeviceSynchronize");

    // Sample every 100,000,000th element, starting at index 99,999,999.
    for (size_t i = 99999999; i < numElements; i += 100000000)
    {
        printf("%d \n", data[i]);
    }

    cudaFreeHost(data);
    checkCUDAError("cudaFreeHost");
    return 0;
}