unspecified launch failure

When I try to use CudaSift I run into this problem. Unfortunately, I just do not know where the problem is or how to find it.

gk20a gpu.0:gk20a_fifo_set_ctx_mmu_error_tsg:TSG 0 generated a mmu fault
gk20a gpu.0:gk20a_set_error_notifier:error notifier set to 31 for ch 502
safeCall() Runtime API error in file cudaSiftH.cu line258 :unspecified launch failure. gk20a gpu.0:gk20a_set_error_notifier
fifo_error_isr:channel reset initiated from fifo_error_isr;

The source code:
double ScaleDown(CudaImage &res, CudaImage &src, float variance)
{
  if (res.d_data==NULL || src.d_data==NULL) {
    printf("ScaleDown: missing data\n");
    return 0.0;
  }
  float h_Kernel[5];
  float kernelSum = 0.0f;
  for (int j=0;j<5;j++) {
    h_Kernel[j] = (float)expf(-(double)(j-2)*(j-2)/2.0/variance);
    kernelSum += h_Kernel[j];
  }
  for (int j=0;j<5;j++)
    h_Kernel[j] = (float)(h_Kernel[j]/kernelSum);
  safeCall(cudaMemcpyToSymbol(d_Kernel, h_Kernel, 5*sizeof(float)));
  dim3 blocks(iDivUp(src.width, SCALEDOWN_W), iDivUp(src.height, SCALEDOWN_H));
  dim3 threads(SCALEDOWN_W + WARP_SIZE + 2);
  ScaleDown<<<blocks, threads>>>(res.d_data, src.d_data, src.width, src.pitch, src.height, res.pitch);
  checkMsg("ScaleDown() execution failed\n");
  safeCall(cudaDeviceSynchronize());
  return 0.0;
}

__global__ void ScaleDown(float *__restrict__ d_Result, float *__restrict__ d_Data, int width, int pitch, int height, int newpitch)
{
  __shared__ float inrow[SCALEDOWN_W+4];
  __shared__ float brow[5*(SCALEDOWN_W/2)];
  __shared__ int yRead[SCALEDOWN_H+4], yWrite[SCALEDOWN_H+4];
  #define dx2 (SCALEDOWN_W/2)
  const int tx = threadIdx.x;
  const int tx0 = tx + 0*dx2;
  const int tx1 = tx + 1*dx2;
  const int tx2 = tx + 2*dx2;
  const int tx3 = tx + 3*dx2;
  const int tx4 = tx + 4*dx2;
  const int xStart = blockIdx.x*SCALEDOWN_W;
  const int yStart = blockIdx.y*SCALEDOWN_H;
  const int xWrite = xStart/2 + tx;
  const float *k = d_Kernel;
  if (tx<SCALEDOWN_H+4) {
    int y = yStart + tx - 1; // every thread handles one y
    y = (y<0 ? 0 : y);
    y = (y>=height ? height-1 : y);
    yRead[tx] = y*pitch; // (SCALEDOWN_H+4) threads put y*pitch into yRead
    yWrite[tx] = (yStart + tx - 4)/2 * newpitch; // r=2, hence (tx-4); y is not used after this
  }
  __syncthreads();
  int xRead = xStart + tx - WARP_SIZE; // the later work is based on xRead
  xRead = (xRead<0 ? 0 : xRead);
  xRead = (xRead>=width ? width-1 : xRead);
  for (int dy=0;dy<SCALEDOWN_H+4;dy+=5) {
    if (tx>=WARP_SIZE-2) // tx>=14
      inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+0] + xRead]; // load the block's row dy+0 (0,5,10,15) (164 elements per row) into inrow[tx]
    __syncthreads();
    if (tx<dx2) // tx<80
      brow[tx0] = k[0]*(inrow[2*tx]+inrow[2*tx+4]) + k[1]*(inrow[2*tx+1]+inrow[2*tx+3]) + k[2]*inrow[2*tx+2]; // compute brow[tx0] from inrow (164 elements)
    __syncthreads();
    if (tx<dx2 && dy>=4 && !(dy&1)) // tx<80, dy>=4, dy is even
      d_Result[yWrite[dy+0] + xWrite] = k[2]*brow[tx2] + k[0]*(brow[tx0]+brow[tx4]) + k[1]*(brow[tx1]+brow[tx3]); // combine buffered rows column-wise and write into the result
    if (dy<(SCALEDOWN_H+3)) { // dy < 19
      if (tx>=WARP_SIZE-2) // tx>=14
        inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+1] + xRead]; // load the block's row dy+1 (1,6,11,16) (164 elements per row) into inrow[tx]
      __syncthreads();
      if (tx<dx2) // tx<80
        brow[tx1] = k[0]*(inrow[2*tx]+inrow[2*tx+4]) + k[1]*(inrow[2*tx+1]+inrow[2*tx+3]) + k[2]*inrow[2*tx+2];
      __syncthreads();
      if (tx<dx2 && dy>=3 && (dy&1)) // tx<80, dy>=3, dy is odd
        d_Result[yWrite[dy+1] + xWrite] = k[2]*brow[tx3] + k[0]*(brow[tx1]+brow[tx0]) + k[1]*(brow[tx2]+brow[tx4]);
    }
    if (dy<(SCALEDOWN_H+2)) {
      if (tx>=WARP_SIZE-2)
        inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+2] + xRead];
      __syncthreads();
      if (tx<dx2)
        brow[tx2] = k[0]*(inrow[2*tx]+inrow[2*tx+4]) + k[1]*(inrow[2*tx+1]+inrow[2*tx+3]) + k[2]*inrow[2*tx+2];
      __syncthreads();
      if (tx<dx2 && dy>=2 && !(dy&1)) // tx<80, dy>=2, dy is even
        d_Result[yWrite[dy+2] + xWrite] = k[2]*brow[tx4] + k[0]*(brow[tx2]+brow[tx1]) + k[1]*(brow[tx3]+brow[tx0]);
    }
    if (dy<(SCALEDOWN_H+1)) {
      if (tx>=WARP_SIZE-2)
        inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+3] + xRead];
      __syncthreads();
      if (tx<dx2)
        brow[tx3] = k[0]*(inrow[2*tx]+inrow[2*tx+4]) + k[1]*(inrow[2*tx+1]+inrow[2*tx+3]) + k[2]*inrow[2*tx+2];
      __syncthreads();
      if (tx<dx2 && dy>=1 && (dy&1)) // tx<80, dy>=1, dy is odd
        d_Result[yWrite[dy+3] + xWrite] = k[2]*brow[tx0] + k[0]*(brow[tx3]+brow[tx2]) + k[1]*(brow[tx4]+brow[tx1]);
    }
    if (dy<SCALEDOWN_H) {
      if (tx>=WARP_SIZE-2)
        inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+4] + xRead];
      __syncthreads();
      if (tx<dx2)
        brow[tx4] = k[0]*(inrow[2*tx]+inrow[2*tx+4]) + k[1]*(inrow[2*tx+1]+inrow[2*tx+3]) + k[2]*inrow[2*tx+2];
      __syncthreads();
      if (tx<dx2 && !(dy&1)) // tx<80, dy>=0, dy is even
        d_Result[yWrite[dy+4] + xWrite] = k[2]*brow[tx1] + k[0]*(brow[tx4]+brow[tx3]) + k[1]*(brow[tx0]+brow[tx2]);
    }
    __syncthreads();
  }
}

You appear to be running on a Jetson TK1

You may want to investigate whether you are hitting a kernel timeout due to the display watchdog:

http://nvidia.custhelp.com/app/answers/detail/a_id/3029/~/using-cuda-and-x
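
If it helps, here is a minimal sketch (not specific to CudaSift, and assuming you are interested in device 0) that asks the runtime whether a kernel execution timeout (watchdog) applies to the device:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  cudaDeviceProp prop;
  cudaError_t err = cudaGetDeviceProperties(&prop, 0);  // device 0 assumed
  if (err != cudaSuccess) {
    printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("Device: %s\n", prop.name);
  // Non-zero means kernels on this device are subject to a run-time limit (watchdog)
  printf("kernelExecTimeoutEnabled: %d\n", prop.kernelExecTimeoutEnabled);
  return 0;
}

If kernelExecTimeoutEnabled is non-zero, a kernel that runs too long can be killed by the watchdog, and that can surface as a launch failure like the one you are seeing.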

I don’t remember whether cuda-memcheck is functional on that platform (I think it should be). You could try running the application under cuda-memcheck to see if you get more information about the unspecified launch failure.
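
For example (the executable name below is just a placeholder for your CudaSift binary):

cuda-memcheck ./cudasift

cuda-memcheck reports out-of-bounds or misaligned global/shared memory accesses inside the kernel, which are a common cause of unspecified launch failures.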