Dear forum members,
I am currently creating a fast CUDA RabbitCT implementation, which performs backprojection on multiple input images to create a 3D output model.
I am trying to create a pipelined kernel that is launched on multiple streams. For this reason, I need to define textures for each stream and for each pipeline stage. However, I have been unable to get a working version of this code using multidimensional arrays. I have achieved good results with the hardcoded implementation below; note the hardcoded texture object indices (e.g. imgTexObj[streamnr][1]).
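For reference, gpuErrchk in the code below is the standard CUDA error-checking wrapper, roughly:

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}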
#define STREAMS_MAX 4
#define PIPELINE_DEPTH 2
cudaStream_t streams[STREAMS_MAX];
cudaArray_t imgArray[STREAMS_MAX][PIPELINE_DEPTH];
cudaTextureObject_t imgTexObj[STREAMS_MAX][PIPELINE_DEPTH];
// Called before algorithm is launched
void initialize()
{
    // Channel descriptor for the single-channel float input images
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    // Specify texture descriptor
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeBorder;
    texDesc.addressMode[1] = cudaAddressModeBorder;
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.readMode = cudaReadModeElementType;
    texDesc.normalizedCoords = false;

    for( int s = 0; s < STREAMS_MAX; s++ ){
        gpuErrchk(cudaStreamCreateWithFlags(&streams[s], cudaStreamNonBlocking));
        for( int p = 0; p < PIPELINE_DEPTH; p++ ){
            // pixelsX/pixelsY hold the input image dimensions (set elsewhere)
            gpuErrchk(cudaMallocArray(&imgArray[s][p], &channelDesc, pixelsX, pixelsY, cudaArrayTextureGather));

            // Specify texture resource descriptor
            struct cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypeArray;
            resDesc.res.array.array = imgArray[s][p];

            gpuErrchk(cudaCreateTextureObject(&imgTexObj[s][p], &resDesc, &texDesc, NULL));
        }
    }
}
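For completeness, the matching teardown is just the reverse (a sketch; cleanup is a hypothetical helper, called once at shutdown):

void cleanup()
{
    for( int s = 0; s < STREAMS_MAX; s++ ){
        for( int p = 0; p < PIPELINE_DEPTH; p++ ){
            gpuErrchk(cudaDestroyTextureObject(imgTexObj[s][p]));
            gpuErrchk(cudaFreeArray(imgArray[s][p]));
        }
        gpuErrchk(cudaStreamDestroy(streams[s]));
    }
}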
__global__ void backproject(const RabbitCtKernelData_t * __restrict__ r, float* __restrict__ volume,
                            const float* __restrict__ matrix, const cudaTextureObject_t TexObj,
                            const cudaTextureObject_t TexObj2)
{
    CalculateMatrixStuff();
    for each volume position {   // pseudocode
        volume += matrixstuff * tex2D<float>(TexObj, i+0.5f, j+0.5f);
        volume += matrixstuff * tex2D<float>(TexObj2, i+0.5f, j+0.5f);
    }
}
// Called for each input image
bool ExecAlgorithm(RabbitCtGlobalData* r)
{
    // streamcnt is a running counter of processed images (maintained elsewhere)
    const uint streamnr = streamcnt % STREAMS_MAX;
    const uint imgnr = r->adv_projNumber % PIPELINE_DEPTH;

    // Copy input image to device memory
    const int inSize = r->S_x * r->S_y * sizeof(float);
    gpuErrchk(cudaMemcpyToArrayAsync(imgArray[streamnr][imgnr], 0, 0, r->Input, inSize,
                                     cudaMemcpyHostToDevice, streams[streamnr]));

    // Call kernel only when all input images have been copied
    if( imgnr == PIPELINE_DEPTH-1 ){
        dim3 threadsPerBlock( 16, 16 );
        dim3 numBlocks( r->L/threadsPerBlock.x, r->L/threadsPerBlock.y );
        backproject<<<numBlocks, threadsPerBlock, 0, streams[streamnr]>>>(r, outputptr, matrixdata,
                                                                          imgTexObj[streamnr][0], imgTexObj[streamnr][1]);
        gpuErrchk( cudaPeekAtLastError() );
    }
    return true;
}
Ideally, I want the textures to be passed via a pointer to the texture objects, so the kernel can select its data based on the PIPELINE_DEPTH parameter, which would relieve me from writing multiple hardcoded implementations for benchmarking comparisons. So, the ideal code looks something like this:
__global__ void backproject(const stuff, const cudaTextureObject_t* __restrict__ TexObj)
{
    CalculateMatrixStuff();
    for each volume position {   // pseudocode
        for( int stage = 0; stage < PIPELINE_DEPTH; stage++ ){
            volume += matrixstuff * tex2D<float>(TexObj[stage], i+0.5f, j+0.5f);
        }
    }
}
bool ExecAlgorithm(RabbitCtGlobalData* r)
{
    ...
    backproject<<<numBlocks, threadsPerBlock, 0, streams[streamnr]>>>(r, outputptr, matrixdata,
                                                                      imgTexObj[streamnr]);
    ...
}
Here only the texture objects for a single stream are passed, since each kernel operates in only a single stream. Passing the entire texture array would also be fine; a sketch of what I mean follows.
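To make that concrete: since cudaTextureObject_t is a plain handle, my understanding is that the per-stream handles could first be copied into device memory, and the kernel given a device pointer to index by stage. A sketch, with hypothetical names d_imgTexObj and copyTexHandlesToDevice (called once after initialize()):

// Hypothetical device-side copies of the texture handles, one array per stream
cudaTextureObject_t* d_imgTexObj[STREAMS_MAX];

void copyTexHandlesToDevice()
{
    for( int s = 0; s < STREAMS_MAX; s++ ){
        gpuErrchk(cudaMalloc(&d_imgTexObj[s], PIPELINE_DEPTH * sizeof(cudaTextureObject_t)));
        // Texture objects are plain handle values, so a bitwise copy is enough
        gpuErrchk(cudaMemcpy(d_imgTexObj[s], imgTexObj[s],
                             PIPELINE_DEPTH * sizeof(cudaTextureObject_t), cudaMemcpyHostToDevice));
    }
}

The launch would then pass d_imgTexObj[streamnr] instead of imgTexObj[streamnr].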
I have tried several options, but I always end up with either a segfault at the array mallocs or an illegal memory access at the kernel launch in the hardcoded implementation above.
Could anybody help me get a good implementation of this global multidimensional texture object array?
Kind regards,
Remy