Hello,
I’m having a problem now.
I am using mapped (zero-copy) host memory together with the stream API.
The goal is to process data sets that are larger than the device memory, and possibly to run faster as well.
init
{
cudaSetDevice(*DeviceID)
cudaSetDeviceFlags(cudaDeviceMapHost);
・
・
}
Processing (device-memory case)
{
cudaMemcpyAsync(dst, src, memsize, cudaMemcpyHostToDevice, stream);
kernel1<<< , 0,stream>>>();
kernel2<<< , 0,stream>>>();
cudaMemcpyAsync(dst, src, memsize, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
}
Processing (host-mapped memory case)
{
cudaHostRegister();
cudaHostGetDevicePointer();
kernel1<<< , 0,stream>>>();
kernel2<<< , 0,stream>>>();
cudaStreamSynchronize(stream);
cudaHostUnregister();
}
cudaMemcpyAsync() does not always work; the result seems to depend on the size:
memsize=10467126 -> NG (fails)
memsize=11520000 -> OK (works)
Is there a limitation on size or alignment when cudaDeviceMapHost is enabled?
My environment:
CUDA 5.0 + compute capability 3.0 GPU + Visual Studio 2010
Driver version 327.23