I am using double buffering to pass data to CUDA kernels and receive results from them, but I don’t get any results back. Why is that?
Code snippets (GpuArray is from the smokeParticles demo; I used it to make accessing CUDA easier):
// Two-element GpuArray sets, one for packed (x, y) pairs and one for z values.
// WriteStreams() uploads into element [!nDest*] and ReadStreams() downloads from
// element [nDest*], so the two elements of each set serve as source/destination.
GpuArray<double2> xyStream [2];
GpuArray<double> zStream [2];
// Host-side staging buffers; allocated in CreateBuffers() and filled by WriteBuffers().
double2* xyBuf = NULL;
double* zBuf = NULL;
// Index (0 or 1) of the element of xyStream / zStream that ReadStreams() reads back.
int nDestXY, nDestZ;
//------------------------------------------------------------------------------
// Gathers strided caller coordinates into the packed host staging buffers.
//
//   pointCount  number of points to gather; values <= 0 are a no-op
//   nStride     distance, in doubles, between consecutive points in x, y and z
//   x, y        source coordinate arrays (must not be null)
//   z           optional source array; when null, zBuf is zero-filled instead
//
// For i in [0, pointCount): xyBuf [i] = (x [i * nStride], y [i * nStride]) and
// zBuf [i] = z [i * nStride]. xyBuf and zBuf must already hold at least
// pointCount elements. Always returns 0.
int WriteBuffers (long pointCount, int nStride, double *x, double *y, double *z)
{
    // A negative count would wrap to a huge size_t in the memset/memcpy below.
    if (pointCount <= 0)
        return 0;

    for (long i = 0, j = 0; i < pointCount; i++, j += nStride) {
        xyBuf [i].x = x [j];
        xyBuf [i].y = y [j];
    }

    if (!z)
        memset (zBuf, 0, pointCount * sizeof (double));
    else if (nStride == 1)
        // Contiguous input: one block copy yields exactly what the strided loop would.
        memcpy (zBuf, z, sizeof (double) * pointCount);
    else {
        // Every other stride (including 0 or negative) only touches z [i * nStride],
        // so it must not go through the block copy, which reads z [0 .. pointCount).
        for (long i = 0, j = 0; i < pointCount; i++, j += nStride)
            zBuf [i] = z [j];
    }
    return 0;
}
//------------------------------------------------------------------------------
// Allocates the host staging buffers for pointCount points, resets both
// destination indices to 0 and fills the buffers from the caller's arrays via
// WriteBuffers(). Always returns 0.
// NOTE(review): the previous xyBuf / zBuf values are overwritten without being
// freed here; repeated calls leak unless the buffers are deleted elsewhere - confirm.
int CreateBuffers (long pointCount, int nStride, double *x, double *y, double *z)
{
    xyBuf = new double2 [pointCount];
    zBuf = new double [pointCount];

    nDestXY = nDestZ = 0;

    // The status returned by WriteBuffers() is intentionally not propagated.
    (void) WriteBuffers (pointCount, nStride, x, y, z);
    return 0;
}
//------------------------------------------------------------------------------
void WriteStreams (long pointCount)
{
memcpy (xyStream [!nDestXY].getHostPtr (), xyBuf, sizeof (double2) * pointCount);
memcpy (zStream [!nDestZ].getHostPtr (), zBuf, sizeof (double) * pointCount);
xyStream [!nDestXY].copy (GpuArray<double2>::HOST_TO_DEVICE, 0, pointCount);
zStream [!nDestZ].copy (GpuArray<double>::HOST_TO_DEVICE, 0, pointCount);
}
//------------------------------------------------------------------------------
// Resets both destination indices to 0, allocates both elements of xyStream and
// zStream for pointCount entries and uploads the staging buffers through
// WriteStreams(). Always returns 0.
// NOTE(review): the three `false` arguments to alloc() presumably disable the
// optional GpuArray features (VBO / double buffer / element array in the
// smokeParticles sample) - confirm against GpuArray.h.
int CreateStreams (long pointCount)
{
    nDestXY = nDestZ = 0;

    for (int i = 0; i < 2; i++)
        xyStream [i].alloc (pointCount, false, false, false);
    for (int i = 0; i < 2; i++)
        zStream [i].alloc (pointCount, false, false, false);

    WriteStreams (pointCount);
    return 0;
}
//------------------------------------------------------------------------------
void ReadStreams (long pointCount)
{
xyStream [nDestXY].copy (GpuArray<double2>::DEVICE_TO_HOST, 0, pointCount);
zStream [nDestZ].copy (GpuArray<double>::DEVICE_TO_HOST, 0, pointCount);
memcpy (xyBuf, xyStream [nDestXY].getHostPtr (), sizeof (double2) * pointCount);
memcpy (zBuf, zStream [nDestZ].getHostPtr (), sizeof (double) * pointCount);
}
//------------------------------------------------------------------------------
int pj_transform (CPJParams *srcdefn, CPJParams *dstdefn, long pointCount, long nStride, double *x, double *y, double *z, bool bUseStreams)
{
CreateBuffers (pointCount, nStride, x, y, z);
CreateStreams (pointCount);
...
// Kernel: for each point i computes
//   xy [i].x = (lp [i].x * to_meter - x0) * ra
//   xy [i].y = (lp [i].y * to_meter - y0) * ra
// One thread handles one point. The point index is the global thread index, so
// the kernel may be launched with any number of blocks; threads whose index is
// >= nPoints do nothing. nPoints defaults to "unbounded" so an existing
// single-block launch that passes only the first six arguments behaves as before.
// NOTE(review): double arithmetic needs a device and build target with
// double-precision support (compute capability >= 1.3 per the CUDA docs) - confirm
// the nvcc -arch setting.
__global__ static
void pj_inv_pre_stream (double2* lp, double2* xy, double to_meter, double x0, double y0, double ra, unsigned int nPoints = 0xFFFFFFFFu)
{
    const unsigned int xIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (xIdx >= nPoints)
        return; // padding thread of the last, partially filled block
    xy [xIdx].x = (lp [xIdx].x * to_meter - x0) * ra;
    xy [xIdx].y = (lp [xIdx].y * to_meter - y0) * ra;
}

// Host wrapper: launches pj_inv_pre_stream over nPoints points.
//
//   lp       device pointer to the nPoints input pairs
//   xy       device pointer receiving the nPoints output pairs
//   nPoints  number of points; 0 is a no-op
//
// The work is split into fixed-size blocks instead of one block of nPoints
// threads: a single block is limited to a few hundred threads (device dependent),
// and a launch exceeding that limit fails without running the kernel. The launch
// itself reports nothing, so callers that need diagnostics should query
// cudaGetLastError () afterwards.
void pj_inv_pre (double2* lp, double2* xy, double to_meter, double x0, double y0, double ra, unsigned int nPoints)
{
    // A launch with zero threads is an invalid configuration.
    if (nPoints == 0)
        return;

    const unsigned int threadsPerBlock = 256;
    // Ceiling division written so that it cannot overflow for large nPoints.
    const unsigned int blockCount = nPoints / threadsPerBlock + (nPoints % threadsPerBlock != 0 ? 1 : 0);
    // NOTE(review): older devices cap gridDim.x at 65535 blocks (~16.7M points at
    // this block size) - confirm this is sufficient for the expected inputs.

    dim3 grid (blockCount, 1, 1);
    dim3 threads (threadsPerBlock, 1, 1);
    pj_inv_pre_stream<<<grid, threads>>> (lp, xy, to_meter, x0, y0, ra, nPoints);
}
// Toggle the xy destination index between 0 and 1; nDestZ is not changed here.
nDestXY = !nDestXY; // switch buffers
// Kernel input is xyStream [!nDestXY], kernel output is xyStream [nDestXY].
// NOTE(review): WriteStreams() uploads into xyStream [!nDestXY] using the index
// value from *before* the toggle above. If no other toggle or upload happens in
// the elided code in between, the input here is the element that was never
// uploaded and the output overwrites the uploaded one - confirm.
pj_inv_pre (xyStream [!nDestXY].getDevicePtr (), xyStream [nDestXY].getDevicePtr (), P->to_meter, P->x0, P->y0, P->ra, nPoints);
// Downloads xyStream [nDestXY] and zStream [nDestZ] into xyBuf / zBuf.
// NOTE(review): WriteStreams() fills zStream [!nDestZ], and nDestZ is not toggled
// in this fragment, so zStream [nDestZ] may never have been written - verify.
ReadStreams ((long) nPoints);
The buffer referenced by xyStream [0].getHostPtr() always contains the initial data, and the buffer referenced by xyStream [1].getHostPtr(), which should contain the computation results after pj_inv_pre() has been called, always contains undefined data. I have no clue why.
(The equivalent ATI Stream Computing code works without problems, by the way.)