I’m having problems with a pointer to pointer parameter in a global function on 64-bit linux. It gives me a “launch failed” error when I use it. I’m compiling PTX and JITting, and it works fine on 32-bit systems.
I’m pretty sure I’m passing and aligning parameters correctly since if I don’t call the function with pointers to pointers then everything works fine. The pointer to pointer parameter is an array of CUdeviceptr’s and I think the problem is how I build it, since if only have one CUdeviceptr in my array the problematic function actually works. Here are the relevant parts of the host code:
[codebox]
dMatrices = (CUdeviceptr*) malloc(sizeof(CUdeviceptr) * kMatrixCount);
for (int i = 0; i < kMatrixCount; i++) {
cuMemAlloc(&dMatrices[i], sizeof(float) * kMatrixSize * kCategoryCount);
}
cuMemAlloc(&dPtrQueue, sizeof(CUdeviceptr) * ptrQueueLength);
hPtrQueue = (CUdeviceptr*) malloc(sizeof(CUdeviceptr) * ptrQueueLength);
int totalCount = 0;
for (int i = 0; i < count; i++) {
for (int j = 0; j < kCategoryCount; j++) {
// *** I think the problem might be with this line below ***
hPtrQueue[totalCount] = dMatrices[probabilityIndices[i]] + (j * kMatrixSize * sizeof(CUdeviceptr));
totalCount++;
}
}
cuMemcpyHtoD(dPtrQueue, hPtrQueue, sizeof(CUdeviceptr) * totalCount);
gpu->LaunchKernel(fMatrixMulADB,
bgTransitionProbabilitiesBlock, bgTransitionProbabilitiesGrid,
parameterCountV, totalParameterCount,
dPtrQueue, dIevc, dEigenValues, dEvec, distanceQueue,
kPaddedStateCount, kPaddedStateCount, totalMatrix);
[/codebox]
the custom kernel launch function used:
[codebox]
void GPUInterface::LaunchKernel(CUfunction deviceFunction,
Dim3Int block,
Dim3Int grid,
int parameterCountV,
int totalParameterCount,
...) { // parameters
cuCtxPushCurrent(cudaContext);
cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z);
int offset = 0;
va_list parameters;
va_start(parameters, totalParameterCount);
for(int i = 0; i < parameterCountV; i++) {
CUdeviceptr param = va_arg(parameters, CUdeviceptr);
// adjust offset alignment requirements
offset = (offset + __alignof(void*) - 1) & ~(__alignof(void*) - 1);
cuParamSetv(deviceFunction, offset, ¶m, sizeof(void*));
offset += sizeof(void*);
}
for(int i = parameterCountV; i < totalParameterCount; i++) {
unsigned int param = va_arg(parameters, unsigned int);
// adjust offset alignment requirements
offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1);
cuParamSeti(deviceFunction, offset, param);
offset += sizeof(param);
}
va_end(parameters);
cuParamSetSize(deviceFunction, offset);
cuLaunchGrid(deviceFunction, grid.x, grid.y);
cuCtxPopCurrent(&cudaContext);
}
[/codebox]
And a portion of the device code:
[codebox]
global void kernelMatrixMulADB(float** listC,
float* A,
float* D,
float* B,
float* distanceQueue,
int length,
int wB,
int totalMatrix) {
shared float* C;
int wMatrix = blockIdx.x % totalMatrix;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
if (tx == 0 && ty == 0) {
C = listC[wMatrix];
}
__syncthreads();
// (…)
C[index] = Csub;
}
[/codebox]
Any ideas what might be the problem?
Thanks.