#define SUBVOL_DIM 128
cudaExtent volSize = make_cudaExtent(...);
cudaArray *d_volArray = 0; // subvolume bound to texture
cudaPitchedPtr d_volPPtr;// subvolume in device memory
float* h_vol = NULL; // The full volume on host
int iDivUp(int a, int b)
{
return ((a % b) != 0)? (a / b + 1): (a / b);
}
//Initialization and mem allocations
void initCuda()
{
...
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaExtent subvolSize = make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM);
CUDA_SAFE_CALL(cudaMalloc3DArray(&d_volArray, &channelDesc, subvolSize));
...
cudaExtent pitchedVolSize = make_cudaExtent(SUBVOL_DIM*sizeof(float), SUBVOL_DIM, SUBVOL_DIM);
CUDA_SAFE_CALL(cudaMalloc3D(&d_volPPtr, pitchedVolSize));
...
}
Host to array copy:
void copy3DHostToArray(float *_src, cudaArray *_dst, cudaExtent copy_extent, cudaPos src_offset)
{
cudaMemcpy3DParms copyParams = {0};
float *h_source = _src + src_offset.x + src_offset.y*volSize.width + src_offset.z*volSize.width*volSize.height;
copyParams.srcPtr = make_cudaPitchedPtr((void*)h_source, volSize.width*sizeof(float), volSize.width, volSize.height);
copyParams.dstArray = _dst;
copyParams.kind = cudaMemcpyHostToDevice;
copyParams.extent = copy_extent;
CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
CUT_CHECK_ERROR("Host -> Array Memcpy failed\n");
}
Device mem to array copy:
void copy3DMemToArray(cudaPitchedPtr _src, cudaArray *_dst)
{
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = _src;
copyParams.dstArray = _dst;
copyParams.kind = cudaMemcpyDeviceToDevice;
copyParams.extent = make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM);
CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
CUT_CHECK_ERROR("Mem -> Array Memcpy failed\n");
}
Device mem to host mem copy:
void copy3DMemToHost(cudaPitchedPtr _src, float *_dst, cudaExtent copy_extent, cudaExtent dst_extent, cudaPos src_offset, cudaPos dst_offset)
{
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = _src;
float *h_target = _dst + dst_offset.x + dst_offset.y*dst_extent.width + dst_offset.z*dst_extent.width*dst_extent.height;//For some reason, using copyParams.dstPos doesn't give correct results, so we set the offset here.
copyParams.dstPtr = make_cudaPitchedPtr((void*)h_target, dst_extent.width*sizeof(float), dst_extent.width, dst_extent.height);
copyParams.kind = cudaMemcpyDeviceToHost;
copyParams.extent = make_cudaExtent(copy_extent.width*sizeof(float), copy_extent.height, copy_extent.depth);
copyParams.srcPos = make_cudaPos(src_offset.x*sizeof(float), src_offset.y, src_offset.z); // We want to copy copy_extent sized volume starting at (x_off, y_off, z_off).
CUDA_SAFE_CALL(cudaMemcpy3D(©Params));
CUT_CHECK_ERROR("Mem -> Host Memcpy failed\n");
}
Memory management (note that there is a one voxel border around every subvolume which is shared with other subvolumes):
cudaExtent subvolIndicesExtents = make_cudaExtent(iDivUp(volSize.width-2, SUBVOL_DIM-2), iDivUp(volSize.height-2, SUBVOL_DIM-2), iDivUp(volSize.depth-2, SUBVOL_DIM-2));
for(int _z = 0; _z< subvolIndicesExtents.depth; _z++)
for(int _y = 0; _y< subvolIndicesExtents.height; _y++)
for(int _x = 0; _x< subvolIndicesExtents.width; _x++)
{
//copy the subvolume to texture
copy3DHostToArray(h_vol, d_volArray, make_cudaExtent(SUBVOL_DIM, SUBVOL_DIM, SUBVOL_DIM), make_cudaPos(_x*(SUBVOL_DIM-2), _y*(SUBVOL_DIM-2), _z*(SUBVOL_DIM-2)));
//fprintf(stderr, "->%s", cudaGetErrorString(cudaGetLastError()));
//run a kernel on subvolume. reads from texture (via d_volArray)and writes to d_volPPtr
d_kernel<<<gridSize, blockSize>>>(d_volPPtr, ...);
CUT_CHECK_ERROR("Kernel failed");
//fprintf(stderr, "---%d-%d-%d %s---", _x, _y, _z, cudaGetErrorString(cudaGetLastError()));
cudaThreadSynchronize();
//Copy results back to host mem from device mem
dst_off.x = 1 + _x*(SUBVOL_DIM-2); dst_off.y = 1 + _y*(SUBVOL_DIM-2); dst_off.z = 1 + _z*(SUBVOL_DIM-2);
copy3DMemToHost(d_volPPtr, h_phi, copyvol, volSize, src_off, dst_off);
//fprintf(stderr, "%s<-\n", cudaGetErrorString(cudaGetLastError()));
}
Notes:
-
copy3DMemToHost() is the most generic of the three functions, but t shouldn’t be difficult to do the same with other two. I have hard-coded some values in the other two.
-
In my experience, setting a copy position (offset) on the host pitched ptr (host memory) never worked. So, I set the correct offset using pointer arithmetic myself and it works. Does anyone know why? Setting offsets on device memory and arrays always works.
-
In case of normal 3D memory on device and host, always set the first argument of cudaPos, cudaExtent to the offset along X in “bytes”. While, with arrays, this has to be in number of elements in X direction. Anyway, this is documented in the API guide.
Most of the code is trivial and taken from cuda examples.
Hope it helps.
Cheers,
Ojaswa