Hi,
I work for a company that is developing a commercial product based on CUDA. However, we have run into
some stability problems. We are using version 2.3 of the CUDA Toolkit and the driver API.
The problem is that our CUDA kernels fail irregularly.
To reproduce the problem we have reduced the code to a loop that does nothing besides calling cuLauchGrid()
and cuCtxSynchronize(). The code can run for thousands of iterations without problems before it suddenly fails.
On Windows XP the usual symptom is that cuCtxSynchronize() returns 700 CUDA_ERROR_LAUNCH_FAILED.
On Windows 7 the usual symptom is that cuCtxSynchronize() returns 999 CUDA_ERROR_UNKNOWN.
The failure appears more or less random, but seem to occur more often on some computers than on others.
Once the error has occured, all subsequent calls fail with the same error code.
The execution time for the kernel is less than a millisecond per iteration.
So it is nowhere near the five second timeout.
Sometimes the cuda driver dies instead of returning an error code. When that happens the computer has to be rebooted.
I need help to solve this problem. Any insight you may have, might be helpful.
The launching code looks like this:
void CUDAInterface::CorrelationX(cudaDeviceBufferPointer inbuf, cudaDeviceBufferPointer outbuf,
const float * const kernel, int numCoefficients,
int sizeX, int sizeY, int sizeZ,
int rowPitch, int slicePitch)
{
CUresult cudaErr = cuCtxPushCurrent(_cudaContext);
if (cudaErr != CUDA_SUCCESS)
printf("cuCtxPushCurrent(%p) returned %d.\n",_cudaContext,cudaErr);
CUfunction kernelFunction;
cudaErr = cuModuleGetFunction(&kernelFunction, _cudaCorrelationKernels, MAKE_STRING(RowCorrelationKernel));
if (cudaErr != CUDA_SUCCESS)
printf("cuModuleGetFunction(%p,%p,%s) returned %d.\n",&kernelFunction,_cudaCorrelationKernels,MAKE_STRING(RowCorrelationKernel),cudaErr);
const int blockSizeX = 32;
const int blockSizeY = 8;
cudaErr = cuFuncSetBlockShape(kernelFunction, blockSizeX, blockSizeY, 1);
if (cudaErr != CUDA_SUCCESS)
printf("cuFuncSetBlockShape(%p,%d,%d,1) returned %d.\n",kernelFunction,blockSizeX,blockSizeY,cudaErr);
int parameterOffset = 0;
AlignParameterOffset(parameterOffset, __alignof(void *));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &inbuf, sizeof(inbuf));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&inbuf,sizeof(inbuf),cudaErr);
parameterOffset+=sizeof(inbuf);
AlignParameterOffset(parameterOffset, __alignof(void *));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &outbuf, sizeof(outbuf));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&outbuf,sizeof(outbuf),cudaErr);
parameterOffset+=sizeof(outbuf);
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &sizeX, sizeof(sizeX));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&sizeX,sizeof(sizeX),cudaErr);
parameterOffset+=sizeof(sizeX);
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &sizeY, sizeof(sizeY));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&sizeY,sizeof(sizeY),cudaErr);
parameterOffset+=sizeof(sizeY);
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &sizeZ, sizeof(sizeZ));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&sizeZ,sizeof(sizeZ),cudaErr);
parameterOffset+=sizeof(sizeZ);
int kernelRadius = (numCoefficients - 1) / 2;
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &kernelRadius, sizeof(kernelRadius));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&kernelRadius,sizeof(kernelRadius),cudaErr);
parameterOffset+=sizeof(kernelRadius);
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &rowPitch, sizeof(rowPitch));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&rowPitch,sizeof(rowPitch),cudaErr);
parameterOffset+=sizeof(rowPitch);
AlignParameterOffset(parameterOffset, __alignof(int));
cudaErr = cuParamSetv(kernelFunction, parameterOffset, &slicePitch, sizeof(slicePitch));
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetv(%p,%d,%p,%d) returned %d.\n",kernelFunction,parameterOffset,&slicePitch,sizeof(slicePitch),cudaErr);
parameterOffset+=sizeof(slicePitch);
cudaErr = cuParamSetSize(kernelFunction, parameterOffset);
if (cudaErr != CUDA_SUCCESS)
printf("cuParamSetSize(%p,%d) returned %d.\n",kernelFunction,parameterOffset,cudaErr);
unsigned int kernelSizeInBytes = numCoefficients * sizeof(float);
SetKernelGlobalParameter(_cudaCorrelationKernels, MAKE_STRING(filterKernelX), kernel, kernelSizeInBytes);
CUdeviceptr devPtr;
cudaErr = cuModuleGetGlobal(&devPtr, NULL, _cudaCorrelationKernels, MAKE_STRING(filterKernelX));
if (cudaErr != CUDA_SUCCESS)
printf("cuModuleGetGlobal(%p,NULL,%p,%s) returned %d.\n",&devPtr, _cudaCorrelationKernels, MAKE_STRING(filterKernelX),cudaErr);
cudaErr = cuMemcpyHtoD(devPtr, kernel, kernelSizeInBytes);
if (cudaErr != CUDA_SUCCESS)
printf("cuMemcpyHtoD(%p,%p,%d) returned %d.\n",&devPtr, kernel, kernelSizeInBytes,cudaErr);
printf("RowCorrelationKernel(%p,%p,%d%,%d,%d,%d,%d,%d)\n",inbuf,outbuf,sizeX,sizeY,sizeZ,kernelRadius,rowPitch,sl
icePitch);
int count = 0;
int errors = 0;
do {
if((count % 1000) == 0)
printf("%d iterations, %d errors.\n",count,errors);
int blockX = sizeZ;
int blockY = (sizeY + blockSizeY - 1) / blockSizeY;
cudaErr = cuLaunchGrid(kernelFunction, blockX, blockY);
if (cudaErr != CUDA_SUCCESS) {
printf("cuLaunchGrid(%d,%d) returned %d.\n",blockX,blockY,cudaErr);
errors++;
}
cudaErr = cuCtxSynchronize();
if (cudaErr != CUDA_SUCCESS) {
printf("cuCtxSynchronize() returned %d.\n",cudaErr);
errors++;
}
count++;
} while(true);
cudaErr = cuCtxPopCurrent(NULL);
if (cudaErr != CUDA_SUCCESS)
printf("cuCtxPopCurrent(NULL) returned %d.\n",cudaErr);
}
The kernel itself looks like this:
#define MAX_KERNEL_WIDTH 15
__device__ __constant__ float filterKernelX[MAX_KERNEL_WIDTH];
#define OVERLAP_OFFSET 16
#define UNROLL_X_3_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[2];
#define UNROLL_X_5_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[4];
#define UNROLL_X_7_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 3] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[4] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[5] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 3] * filterKernelX[6];
#define UNROLL_X_9_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 4] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 3] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[4] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[5] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[6] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 3] * filterKernelX[7] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 4] * filterKernelX[8];
#define UNROLL_X_11_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 5] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 4] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 3] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[4] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[5] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[6] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[7] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 3] * filterKernelX[8] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 4] * filterKernelX[9] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 5] * filterKernelX[10];
#define UNROLL_X_13_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 6] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 5] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 4] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 3] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[4] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[5] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[6] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[7] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[8] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 3] * filterKernelX[9] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 4] * filterKernelX[10] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 5] * filterKernelX[11] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 6] * filterKernelX[12];
#define UNROLL_X_15_2 \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 7] * filterKernelX[0] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 6] * filterKernelX[1] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 5] * filterKernelX[2] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 4] * filterKernelX[3] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 3] * filterKernelX[4] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 2] * filterKernelX[5] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x - 1] * filterKernelX[6] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x ] * filterKernelX[7] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 1] * filterKernelX[8] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 2] * filterKernelX[9] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 3] * filterKernelX[10] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 4] * filterKernelX[11] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 5] * filterKernelX[12] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 6] * filterKernelX[13] + \
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x + 7] * filterKernelX[14];
extern "C"
__global__ void RowCorrelationKernel(float *inbuf, float *outbuf, int sizeX, int sizeY, int sizeZ, int kernelRadius, int rowPitch, int slicePitch)
{
__shared__ float data[8][64];
const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
const int yzPos = blockIdx.x * slicePitch + yIndex * rowPitch;
for (int x = 0; x < sizeX; x+=blockDim.x) {
int tileEnd = min(blockDim.x, sizeX - x); // Number of valid columns for iteration
// Read data
if (yIndex < sizeY && x + threadIdx.x < sizeX) {
data[threadIdx.y][OVERLAP_OFFSET + threadIdx.x] = inbuf[yzPos + x + threadIdx.x];
}
// Pad left and right
if (yIndex < sizeY && threadIdx.x < kernelRadius) {
data[threadIdx.y][OVERLAP_OFFSET - 1 - (int) threadIdx.x] = inbuf[yzPos + max(x - 1 - (int) threadIdx.x, 0)];
data[threadIdx.y][OVERLAP_OFFSET + tileEnd + threadIdx.x] = inbuf[yzPos + min(x + tileEnd + threadIdx.x, sizeX - 1)];
}
__syncthreads();
if (yIndex < sizeY && x + threadIdx.x < sizeX) {
float sum = 0.0f;
if (kernelRadius == 7) {
sum = UNROLL_X_15_2;
}
else if (kernelRadius == 6) {
sum = UNROLL_X_13_2;
}
else if (kernelRadius == 5) {
sum = UNROLL_X_11_2;
}
else if (kernelRadius == 4) {
sum = UNROLL_X_9_2;
}
else if (kernelRadius == 3) {
sum = UNROLL_X_7_2;
}
else if (kernelRadius == 2) {
sum = UNROLL_X_5_2;
}
else if (kernelRadius == 1) {
sum = UNROLL_X_3_2;
}
outbuf[yzPos + x + threadIdx.x] = sum;
}
__syncthreads();
}
}
NVIDIA systeminfo from one of the computers where the problem occur frequently:
[Bildskärm]
Processor: Intel® Core™2 Quad CPU Q9450 @ 2.66GHz (2659 MHz)
Operativsystem: Microsoft Windows XP, 32-bit (Service Pack 3)
DirectX-version: 9.0c
GPU-enhetens processor: Quadro NVS 290
Drivrutinsversion: 190.38
Streamprocessorer: 16
Kärnklocka: 459 MHz
Shaderklocka: 918 MHz
Minnesklocka: 400 MHz (800 MHz datahastighet)
Minnesgränssnitt: 64 Bitar
Minne: 256 MB
Video BIOS-version: 60.86.63.00.19
IRQ: 16
Buss: PCI Express x16
[Komponenter]
nvCplUIR.dll 2.7.130.16 NVIDIA Control Panel
nvCpl.cpl 2.7.130.16 NVIDIA Control Panel Applet
nvExpBar.dll 1.5.30.42 NVIDIA Control Panel
nvCplUI.exe 2.7.130.16 NVIDIA Control Panel
nvViTvSR.dll 6.14.11.9038 NVIDIA Video and TV Server
nvViTvS.dll 6.14.11.9038 NVIDIA Video and TV Server
nvDispSR.dll 6.14.11.9038 NVIDIA Display Server
NVMCTRAY.DLL 6.14.11.9038 NVIDIA Media Center Library
NVOGLNT.DLL 6.14.11.9038 NVIDIA Compatible OpenGL ICD
nvDispS.dll 6.14.11.9038 NVIDIA Display Server
NVCPL.DLL 6.14.11.9038 NVIDIA Compatible Windows 2000 Display driver, Version 190.38
NV4_MINI.SYS 6.14.11.9038 NVIDIA Compatible Windows 2000 Miniport Driver, Version 190.38
NV4_DISP.DLL 6.14.11.9038 NVIDIA Compatible Windows 2000 Display driver, Version 190.38
nvMoblSR.dll 6.14.11.9038 NVIDIA Mobile Server
nvMoblS.dll 6.14.11.9038 NVIDIA Mobile Server
nvWSSR.dll 6.14.11.9038 NVIDIA Workstation Server
nvWSS.dll 6.14.11.9038 NVIDIA Workstation Server
PhysX 09.09.1112 NVIDIA PhysX
NVCUDA.DLL 6.14.11.9038 NVIDIA CUDA 2.3 driver
nvGameSR.dll 6.14.11.9038 NVIDIA 3D Settings Server
nvGameS.dll 6.14.11.9038 NVIDIA 3D Settings Server