CUDA-MEMCHECK is used to check my application which is run on TX1.After its run the output file shows:
========= Invalid global write of size 4
========= at 0x000005a8 in cudaSiftD.cu:69:ScaleDown(float*, float*, int, int, int, int)
========= by thread (0,0,0) in block (1,33,0)
========= Address 0x1ed17140 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1 (cuLaunchKernel + 0x228) [0xdf7b8]
========= Host Frame:cuda_sift [0x11c48]
========= Host Frame:cuda_sift [0x5195c]
========= Host Frame:cuda_sift [0xc72e]
========= Host Frame:cuda_sift (_Z34__device_stub__Z9ScaleDownPfS_iiiiPfS_iiii + 0xed) [0xb942]
========= Host Frame:cuda_sift (_Z9ScaleDownPfS_iiii + 0x21) [0xb96a]
========= Host Frame:cuda_sift (_Z9ScaleDownR9CudaImageS0_f + 0x1a9) [0xa956]
========= Host Frame:cuda_sift (_Z11ExtractSiftfR8SiftDataR9CudaImageidfff + 0x99) [0x99be]
========= Host Frame:cuda_sift (_Z11ExtractSiftfR8SiftDataR9CudaImageidfff + 0xff) [0x9a24]
========= Host Frame:cuda_sift (_Z4siftfRN2cv3MatES1_Pf + 0x153) [0x8a68]
========= Host Frame:cuda_sift (_Z12pthread_mainPv + 0x2d9) [0x5f2a]
========= Program hit cudaErrorLaunchFailure (error 4) due to “unspecified launch failure” on CUDA API call to cudaThreadSynchronize.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/arm-linux-gnueabihf/tegra/libcuda.so.1 [0x29791c]
========= Host Frame:cuda_sift [0x405b8]
========= Host Frame:cuda_sift (_Z9ScaleDownR9CudaImageS0_f + 0x1c3) [0xa970]
========= Host Frame:cuda_sift (_Z11ExtractSiftfR8SiftDataR9CudaImageidfff + 0x99) [0x99be]
========= Host Frame:cuda_sift (_Z11ExtractSiftfR8SiftDataR9CudaImageidfff + 0xff) [0x9a24]
========= Host Frame:cuda_sift (_Z4siftfRN2cv3MatES1_Pf + 0x153) [0x8a68]
========= Host Frame:cuda_sift (_Z12pthread_mainPv + 0x2d9) [0x5f2a]
The cudaSiftD.cu::ScaleDown(float*, float*, int, int, int, int) was showed as following:
global void ScaleDown(float * restrict d_Result, float * restrict d_Data, int width, int pitch, int height, int newpitch)
{
shared float inrow[SCALEDOWN_W+4];
shared float brow[5*(SCALEDOWN_W/2)];
shared int yRead[SCALEDOWN_H+4], yWrite[SCALEDOWN_H+4];
#define dx2 (SCALEDOWN_W/2)
const int tx = threadIdx.x;
const int tx0 = tx + 0dx2;
const int tx1 = tx + 1dx2;
const int tx2 = tx + 2dx2;
const int tx3 = tx + 3dx2;
const int tx4 = tx + 4dx2;
const int xStart = blockIdx.xSCALEDOWN_W;
const int yStart = blockIdx.ySCALEDOWN_H;
const int xWrite = xStart/2 + tx;
const float k = d_Kernel;
if (tx<SCALEDOWN_H+4) {
int y = yStart + tx - 1; //every threads have one y
y = (y<0 ? 0 : y);
y = (y>=height ? height-1 : y);
yRead[tx] = ypitch; //(SCALEDOWN_H+4) threads put ypitch into yread
yWrite[tx] = (yStart + tx - 4)/2 * newpitch; //r=2,thus here is (tx-4),y is useless after this
}
__syncthreads();
int xRead = xStart + tx - WARP_SIZE; //the later work is based on xRead
xRead = (xRead<0 ? 0 : xRead);
xRead = (xRead>=width ? width-1 : xRead);
for (int dy=0;dy<SCALEDOWN_H+4;dy+=5) {
if (tx>=WARP_SIZE-2) //tx>=14
inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+0] + xRead]; //put block’s row.dy+0(0,5,10,15)(164 elements in every rows) into all block’s inrowtx
__syncthreads();
if (tx<dx2) //tx<80
brow[tx0] = k[0](inrow[2tx]+inrow[2tx+4]) + k[1](inrow[2tx+1]+inrow[2tx+3]) + k[2]inrow[2tx+2]; //compute browtx by inrow(164 elements)
__syncthreads();
if (tx<dx2 && dy>=4 && !(dy&1)) //tx<80, dy>=4, dy is severls times of 2
d_Result[yWrite[dy+0] + xWrite] = k[2]brow[tx2] + k[0](brow[tx0]+brow[tx4]) + k[1](brow[tx1]+brow[tx3]); //compute data by cols, write into result
if (dy<(SCALEDOWN_H+3)) { //dy < 19
if (tx>=WARP_SIZE-2) //tx>=14
inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+1] + xRead]; //put block’s row.dy+1(1,6,11,16)(164 elements in every rows) into all block’s inrowtx
__syncthreads();
if (tx<dx2) //tx<80
brow[tx1] = k[0](inrow[2tx]+inrow[2tx+4]) + k[1](inrow[2tx+1]+inrow[2tx+3]) + k[2]inrow[2tx+2];
__syncthreads();
if (tx<dx2 && dy>=3 && (dy&1)) //tx<80, dy>=3, dy isn’t severls times of 2
d_Result[yWrite[dy+1] + xWrite] = k[2]brow[tx3] + k[0](brow[tx1]+brow[tx0]) + k[1](brow[tx2]+brow[tx4]);
}
if (dy<(SCALEDOWN_H+2)) {
if (tx>=WARP_SIZE-2)
inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+2] + xRead];
__syncthreads();
if (tx<dx2)
brow[tx2] = k[0](inrow[2tx]+inrow[2tx+4]) + k[1](inrow[2tx+1]+inrow[2tx+3]) + k[2]inrow[2tx+2];
__syncthreads();
if (tx<dx2 && dy>=2 && !(dy&1)) //tx<80, dy>=2, dy is severls times of 2
d_Result[yWrite[dy+2] + xWrite] = k[2]brow[tx4] + k[0](brow[tx2]+brow[tx1]) + k[1](brow[tx3]+brow[tx0]);
}
if (dy<(SCALEDOWN_H+1)) {
if (tx>=WARP_SIZE-2)
inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+3] + xRead];
__syncthreads();
if (tx<dx2)
brow[tx3] = k[0](inrow[2tx]+inrow[2tx+4]) + k[1](inrow[2tx+1]+inrow[2tx+3]) + k[2]inrow[2tx+2];
__syncthreads();
if (tx<dx2 && dy>=1 && (dy&1)) //tx<80, dy>=1, dy isn’t severls times of 2
d_Result[yWrite[dy+3] + xWrite] = k[2]brow[tx0] + k[0](brow[tx3]+brow[tx2]) + k[1](brow[tx4]+brow[tx1]);
}
if (dy<SCALEDOWN_H) {
if (tx>=WARP_SIZE-2)
inrow[tx-WARP_SIZE+2] = d_Data[yRead[dy+4] + xRead];
__syncthreads();
if (tx<dx2)
brow[tx4] = k[0](inrow[2tx]+inrow[2tx+4]) + k[1](inrow[2tx+1]+inrow[2tx+3]) + k[2]inrow[2tx+2];
__syncthreads();
if (tx<dx2 && !(dy&1)) //tx<80, dy>=0, dy is severls times of 2
d_Result[yWrite[dy+4] + xWrite] = k[2]brow[tx1] + k[0](brow[tx4]+brow[tx3]) + k[1]*(brow[tx0]+brow[tx2]);
}
__syncthreads();
}
}
the 69 line is:
d_Result[yWrite[dy+1] + xWrite] = k[2]brow[tx3] + k[0](brow[tx1]+brow[tx0]) + k[1]*(brow[tx2]+brow[tx4]);
This problem causes my application to crash immediately but sometime it can still run for a long time.