Hi, I have a problem with mapping a texture to a 3D array when using multiple GPUs:
These are the global variables as they are defined in my kernel.cu file:
[codebox]cudaArray *d_brickArray[BRICK_COUNT] = {0};
texture<VolumeType, 3, cudaReadModeNormalizedFloat> tex;[/codebox]
This is the host function I use to copy the data to the array:
[codebox]extern "C"
void copyBrickDataToCuda(void *h_brick, cudaExtent brickSize, int brickNumber)
{
int id;
cutilSafeCall(cudaGetDevice(&id));
printf("Copy brick %d on device %d\n", brickNumber, id);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
cutilSafeCall( cudaMalloc3DArray(&d_brickArray[brickNumber], &channelDesc, brickSize) );
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr(h_brick, brickSize.width*sizeof(VolumeType), brickSize.width, brickSize.height);
copyParams.dstArray = d_brickArray[brickNumber];
copyParams.extent = brickSize;
copyParams.kind = cudaMemcpyHostToDevice;
cutilSafeCall( cudaMemcpy3D(&copyParams) );
}[/codebox]
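For context, this is roughly how the function is called from the host side (a simplified sketch; the brick dimensions are just made up, the real loading code is longer):
[codebox]// simplified host-side usage (sketch, made-up brick size)
cudaExtent brickSize = make_cudaExtent(64, 64, 64);
void *h_brick = malloc(brickSize.width * brickSize.height * brickSize.depth * sizeof(VolumeType));
// ... fill h_brick with the volume data for this brick ...
copyBrickDataToCuda(h_brick, brickSize, 0);  // upload brick 0 on the current device[/codebox]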
And with this function I map the texture to the array:
[codebox]extern "C"
void mapTextureToArray(int brickNumber)
{
int id;
cutilSafeCall(cudaGetDevice(&id));
printf("Mapping brick %d on device %d\n", brickNumber, id);
// bind array to 3D texture
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
cutilSafeCall(cudaBindTextureToArray(tex, d_brickArray[brickNumber], channelDesc));
}[/codebox]
Now in my main .cpp file I have a parallel section like this:
[codebox]omp_set_dynamic(0);
omp_set_num_threads(gpuCount);
//////////////////////////////////////////////////////////////////////////
// distribute bricks across gpus
int numberOfBricksPerGPU = NUMBEROFBRICKS / gpuCount;
#pragma omp parallel
{
uint cpu_thread_id = omp_get_thread_num();
if(cpu_thread_id != 0)
{
cutilSafeCall(cudaGLSetGLDevice(cpu_thread_id));
}
:
:
printf("Thread %d: loading brick nr %d to Device %d\n", cpu_thread_id, i, threadDeviceId);
copyBrickDataToCuda(h_brick, brickSize, i);
:
mapTextureToArray(brickIndex);
callKernel<<<>>>
cutilCheckMsg("kernel failed");
}[/codebox]
If I set the number of threads to use to 0, it works. Furthermore, when using multiple threads, the first thread
(the one that uses the device that was set before) runs without problems. All other threads report an error
at the line where the cutilCheckMsg call is placed, saying "kernel failed: invalid resource handle".
So I decided to test it with a call to a completely empty kernel, but the result is the same: even the empty kernel gives "kernel failed: invalid resource handle".
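For reference, the empty-kernel test looks roughly like this (a sketch; emptyKernel / callEmptyKernel are just placeholder names for my test code):
[codebox]// minimal test kernel that does nothing
__global__ void emptyKernel() {}

extern "C"
void callEmptyKernel()
{
    emptyKernel<<<1, 1>>>();
    cutilCheckMsg("kernel failed");  // still reports "invalid resource handle" on the other threads
}[/codebox]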
I have been able to narrow the problem down to the call of the mapTextureToArray() function (I think), because when I don't call this function the empty
kernel runs without errors.
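One check I could add (just a sketch of a diagnostic variant, not something I have run yet) is to capture the error from the bind call directly in each thread, instead of going through cutilSafeCall:
[codebox]extern "C"
void mapTextureToArrayChecked(int brickNumber)
{
    int id;
    cudaGetDevice(&id);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<VolumeType>();
    // bind without cutilSafeCall so the raw error code is visible per thread/device
    cudaError_t err = cudaBindTextureToArray(tex, d_brickArray[brickNumber], channelDesc);
    printf("Binding brick %d on device %d returned: %s\n", brickNumber, id, cudaGetErrorString(err));
}[/codebox]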
So the question is: how are global variables handled when multiple threads are used to drive multiple GPUs at once? As far as I know, the API should in a way
copy the whole context for each thread, so that every host thread has access to the same resources (please correct me if this is wrong).
I just can't see the problem here.
I'm running this on a Windows 7 x64 system. Available are one Quadro FX 5800 and two Tesla C1060.
The driver version is 260.61 and the CUDA Toolkit version is 3.2.
Best regards
Tobi