JIT Compile Fails Silently

Hi,

I noticed that if JIT compilation of embedded PTX code fails (e.g. because I’m using too much shared memory), the program just does nothing at all instead of giving me an error.

Example:

nvcc -arch=compute_11 -code=sm_11 --use_fast_math test.cu -o test

ptxas /tmp/tmpxft_00000c90_00000000-2_test.ptx, line 123; warning : Double is not supported. Demoting to float

ptxas error : Entry function '_Z13kernel_sharedPfjS_S_' uses too much shared data (0x9190 bytes + 0x10 bytes system, 0x4000 max)
$ nvcc -arch=compute_11 -code=compute_11 --use_fast_math test.cu -o test

$ ./test 

[no GPU code executed]

How can I detect PTX compiling problems at runtime?

Thanks,

Nikolaus

I doesn’t fail silently. If you are using the driver API, you will get an explicit error returned when you try to load the PTX. In the runtime API, JIT compilation failure will kill the context, and any further operations on the context will produce errors which can be trapped with the usual runtime error checking.

Hmm. What would be a suitable operation to produce this error in the runtime API? I am calling cudaEventRecord() and other kernels after the failed launch, and they don’t return any errors.

Calling cudaGetLastError directly after the launch should do it. Don’t pay too much attention to the code, it is mostly nonsense, but this:

#include <stdlib.h>

#include <stdio.h>

#ifndef gpuAssert

#include <stdio.h>

#define gpuAssert( condition ) { if( (condition) != 0 ) { fprintf( stderr, "\n FAILURE, code %u : %s in %s, line %d\n", condition, cudaGetErrorString(condition), __FILE__, __LINE__ ); exit( cudaThreadExit() ); } }

#endif

#define frand(min,max) ( (min) + ((max)-(min))*(float)rand() / RAND_MAX )

#define ds7 (16384)

#define ds1 (256)

template<unsigned int bsize>

__global__ void testfunc(float *in, float *out)

{

    __shared__ float buff[bsize];

    volatile float * bpoint = &buff[0];

unsigned int spos = blockIdx.x * bsize;

for(int i = threadIdx.x; i < bsize; i+= blockDim.x)

        bpoint[i] = in[spos + i];

__syncthreads();

if (threadIdx.x < 32) {

        for(int i = 32 + threadIdx.x; i < bsize; i += 32)

            bpoint[threadIdx.x] += bpoint[i];

if (threadIdx.x < 16)

            bpoint[threadIdx.x] += bpoint[threadIdx.x+16];    

        if (threadIdx.x < 8)

            bpoint[threadIdx.x] += bpoint[threadIdx.x+8];    

        if (threadIdx.x < 4)

            bpoint[threadIdx.x] += bpoint[threadIdx.x+4];    

        if (threadIdx.x < 2)

            bpoint[threadIdx.x] += bpoint[threadIdx.x+2];    

        if (threadIdx.x == 0) 

            out[blockIdx.x] = bpoint[0] + bpoint[1];

    }

}

int main(void)

{

    const size_t dsize = 2097152;

    const dim3 blocksize = dim3(128,1,1);

float * rawdata = (float *)malloc(dsize * sizeof(float));

    for(unsigned int i=0; i<dsize; i++)

        rawdata[i] = frand(0.,1.);

float * _rawdata; 

    gpuAssert( cudaMalloc((void **)&_rawdata, dsize * sizeof(float)) );

// Legal shared memory buffer case

    size_t nblocks1 = dsize / ds1;

    dim3 gridsize1 = dim3(nblocks1, 1);

    float * _output1;

    fprintf(stdout, "Legal case starting\n"); fflush(stdout);

    gpuAssert( cudaMalloc((void **)&_output1, nblocks1 * sizeof(float)) );

    testfunc<ds1><<< gridsize1, blocksize >>>(_rawdata, _output1);

    gpuAssert( cudaGetLastError() );

    gpuAssert( cudaThreadSynchronize() );

    fprintf(stdout, "Legal case completed\n"); fflush(stdout);

// Illegal shared memory buffer case

    size_t nblocks7 = dsize / ds7;

    dim3 gridsize7 = dim3(nblocks7, 1);

    float * _output7;

    fprintf(stdout, "Illegal case starting\n"); fflush(stdout);

    gpuAssert( cudaMalloc((void **)&_output7, nblocks7 * sizeof(float)) );

    testfunc<ds7><<< gridsize7, blocksize >>>(_rawdata, _output7);

    gpuAssert( cudaGetLastError() );

    gpuAssert( cudaThreadSynchronize() );

    fprintf(stdout, "Illegal case completed\n"); fflush(stdout);

return cudaThreadExit();

}

which does this for me:

avidday@cuda:~$ nvcc -arch=compute_13 -code=compute_13 -Xptxas="-v" shmerror.cu -o shmerror

avidday@cuda:~$ ./shmerror 

Legal case starting

Legal case completed

Illegal case starting

FAILURE, code 0 : no error in shmerror.cu, line 74

The slightly unusual output from the assert is what happens when the context is killed.