Hi everyone,
I have run into a problem when using CUDA streams together with CULA functions. The following is the whole pseudocode:
// Stage 1: for each stream, enqueue an async host-to-device copy of the i-th
// m*n float chunk of h_X, then launch sumReduction_kernel on the same stream.
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaMemcpyAsync(dev_X + i*m*n, h_X + i*m*n, m*n * sizeof(float), cudaMemcpyHostToDevice, streams[i]));
// Launch configuration: a 4 x m grid of 1024 x 1 thread blocks.
dim3 sumGrid(4, m);
dim3 sumBlock(1024, 1);
// Dynamic shared memory request: one float per thread along block.x.
int sharedSize = sumBlock.x * sizeof(float);
// Output goes to the i-th m-element slice of dev_Xmean
// (presumably one mean per row -- TODO confirm against the kernel).
sumReduction_kernel << <sumGrid, sumBlock, sharedSize, streams[i] >> > (dev_Xmean + i*m, dev_X + i*m*n, m, n);
}
// Stage 2: launch sub1_kernel per stream with no dynamic shared memory; it is
// passed dev_XFinal, dev_X and dev_Xmean slices for chunk i.
// NOTE(review): sumGrid and sumBlock are declared inside the previous loop body,
// so as written they are out of scope here -- presumably the real code declares
// them outside the loops; confirm.
for (int i = 0; i < nstreams; i++)
{
sub1_kernel << <sumGrid, sumBlock, 0, streams[i] >> > (dev_XFinal + i*m*n, dev_X + i*m*n, dev_Xmean + i*m, m, n);
}
// Stage 3 (the part marked bold with [b]...[/b], which is forum markup, not code):
// async device-to-host copy of each dev_XFinal chunk into h_Xfinal on its stream.
[b]for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaMemcpyAsync(h_Xfinal + i*m*n, dev_XFinal + i*m*n, sizeof(float) * m * n, cudaMemcpyDeviceToHost, streams[i]));
}[/b]
// Stage 4: per-chunk CULA sequence (gemm, getrf, getri, gemm), each followed by
// a status check. Arguments are omitted in this pseudocode.
// NOTE(review): no explicit cudaStreamSynchronize/cudaDeviceSynchronize appears
// between the stream work above and these calls, and no stream is visibly passed
// to CULA -- confirm whether the inputs are guaranteed to be ready at this point.
for (int i = 0; i < nstreams; i++)
{
status = culaDeviceSgemm();
checkStatus(status);
// The call below (bold in the post) is the one reported to fail.
[b]status = culaDeviceSgetrf();
checkStatus(status);
[/b]
status = culaDeviceSgetri();
checkStatus(status);
printf("%s\n", "CULA inverse had done!");
status = culaDeviceSgemm();
checkStatus(status);
}
In this pseudocode, if I remove the memcpy to h_Xfinal (the loop shown in bold), the following error occurs when executing culaDeviceSgetrf():
“CULA Dense : Data error at pos 1 (see the reference Manual for guidance)”
Also, if I allocate h_Xfinal as pinned memory, the same error occurs.
My question is: what can I use instead of the D2H memcpy? (It serves no other purpose here and only costs time.)
Thank you~