x86_64, driver API, and pointers to pointers

I’m having problems with a pointer to pointer parameter in a global function on 64-bit linux. It gives me a “launch failed” error when I use it. I’m compiling PTX and JITting, and it works fine on 32-bit systems.

I’m pretty sure I’m passing and aligning parameters correctly since if I don’t call the function with pointers to pointers then everything works fine. The pointer to pointer parameter is an array of CUdeviceptr’s and I think the problem is how I build it, since if only have one CUdeviceptr in my array the problematic function actually works. Here are the relevant parts of the host code:

[codebox]

    dMatrices = (CUdeviceptr*) malloc(sizeof(CUdeviceptr) * kMatrixCount);

for (int i = 0; i < kMatrixCount; i++) {

        cuMemAlloc(&dMatrices[i], sizeof(float) * kMatrixSize * kCategoryCount);

    }

cuMemAlloc(&dPtrQueue, sizeof(CUdeviceptr) * ptrQueueLength);

    hPtrQueue = (CUdeviceptr*) malloc(sizeof(CUdeviceptr) * ptrQueueLength);

int totalCount = 0;

    for (int i = 0; i < count; i++) {        

        for (int j = 0; j < kCategoryCount; j++) {

            // *** I think the problem might be with this line below ***

            hPtrQueue[totalCount] = dMatrices[probabilityIndices[i]] + (j * kMatrixSize * sizeof(CUdeviceptr));

            totalCount++;

        }

    }

cuMemcpyHtoD(dPtrQueue, hPtrQueue, sizeof(CUdeviceptr) * totalCount);

gpu->LaunchKernel(fMatrixMulADB,

                      bgTransitionProbabilitiesBlock, bgTransitionProbabilitiesGrid,

                      parameterCountV, totalParameterCount,

                      dPtrQueue, dIevc, dEigenValues, dEvec, distanceQueue,

                      kPaddedStateCount, kPaddedStateCount, totalMatrix);

[/codebox]

the custom kernel launch function used:

[codebox]

void GPUInterface::LaunchKernel(CUfunction deviceFunction,

                            Dim3Int block,

                            Dim3Int grid,

                            int parameterCountV,

                            int totalParameterCount,

                            ...) { // parameters

cuCtxPushCurrent(cudaContext);

cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z);

int offset = 0;

va_list parameters;

va_start(parameters, totalParameterCount);  

for(int i = 0; i < parameterCountV; i++) {

    CUdeviceptr param = va_arg(parameters, CUdeviceptr);

// adjust offset alignment requirements

    offset = (offset + __alignof(void*) - 1) & ~(__alignof(void*) - 1);

cuParamSetv(deviceFunction, offset, &param, sizeof(void*));

offset += sizeof(void*);

}

for(int i = parameterCountV; i < totalParameterCount; i++) {

    unsigned int param = va_arg(parameters, unsigned int);

// adjust offset alignment requirements

    offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1);

cuParamSeti(deviceFunction, offset, param);

offset += sizeof(param);

}

va_end(parameters);

cuParamSetSize(deviceFunction, offset);

cuLaunchGrid(deviceFunction, grid.x, grid.y);

cuCtxPopCurrent(&cudaContext);

}

[/codebox]

And a portion of the device code:

[codebox]

global void kernelMatrixMulADB(float** listC,

                               float* A,

                               float* D,

                               float* B,

                               float* distanceQueue,

                               int length,

                               int wB,

                               int totalMatrix) {

shared float* C;

int wMatrix = blockIdx.x % totalMatrix;

// Thread index

int tx = threadIdx.x;

int ty = threadIdx.y;

if (tx == 0 && ty == 0) {

    C = listC[wMatrix];

}

__syncthreads();

// (…)

C[index] = Csub;

}

[/codebox]

Any ideas what might be the problem?

Thanks.

If anyone is curious the fix was to change this bit of the kernel code:

C = listC[wMatrix];

to this:

C = (float*) *((int*)listC + wMatrix);