I am trying to learn how to use shared memory for image filtering operations (e.g. Sobel 3x3), and i am having some trouble. My kernel keeps exiting on from what I can tell is an access violation (this is in EmuDebug mode), but the compiler fails to break where the exception was thrown. It gets well into a few thread blocks worth of execution before it crashes. The input is a 1024x768 floating point mono image. How do i get the compiler to break on and access violation exception in a kernel? Can anyone see some fundamental thing wrong with this code that could cause this?
#define BLOCK_SIZE 16
texture<float, 2, cudaReadModeElementType> texData;
#define IDX(p,x,y) (IMUL(p,y) + x)
global void conv_sobel_sharedmem_dx_f(float* d_data, int d_pitch, int width, int height)
{
//x y adress within the image corresponding to the thread
const int ix = IMUL(blockDim.x, blockIdx.x) + threadIdx.x - 1;
const int iy = IMUL(blockDim.y, blockIdx.y) + threadIdx.y - 1;
//texture coord into the image
const float itx = (float)ix + 0.5f;
const float ity = (float)iy + 0.5f;
if(ix >= width || iy >= height)
return;
//shared memory for the entire image bloack and the apron
__shared__ float shared_data[BLOCK_SIZE * BLOCK_SIZE];
//just for clarity
const int spitch = blockDim.x;
const int sx = threadIdx.x;
const int sy = threadIdx.y;
//load into shared memory
shared_data[IDX(spitch,sx,sy)] = tex2D(texData,itx,ity);
__syncthreads();
if((threadIdx.x == 0) || (threadIdx.x == blockDim.x-1) ||
(threadIdx.y == 0) || (threadIdx.y == blockDim.y-1))
return;
d_data[d_pitch*iy+ix] = (shared_data[IDX(spitch,sx-1,sy-1)] * -1.0 +
shared_data[IDX(spitch,sx ,sy-1)] * 1.0 +
shared_data[IDX(spitch,sx+1,sy-1)] * -2.0 +
shared_data[IDX(spitch,sx-1,sy+1)] * 2.0 +
shared_data[IDX(spitch,sx ,sy+1)] * -1.0 +
shared_data[IDX(spitch,sx+1,sy+1)] * 1.0)/9.0f;
}
gvECode gvSobelImageSharedMem_f(gvImage* pDest, gvImage* pSrc, int dir)
{
gvECode ret;
GV_CHECK_COMPAT(pDest,NTDEV_IMTYPE,GV_DEPTH_32F,1);
GV_CHECK_COMPAT(pSrc,TDEV_IMTYPE,GV_DEPTH_32F,1);
CUDA_SAFE_CALL(cudaBindTextureToArray(texData,(cudaArray*)(pSrc->data)));
//image block is 14x14 for each thread block of 16x16
int2 grid_size;
grid_size.x = iDivUp(pDest->width, (BLOCK_SIZE - 2));
grid_size.y = iDivUp(pDest->height, (BLOCK_SIZE - 2));
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(grid_size.x, grid_size.y);
conv_sobel_sharedmem_dx_f<<<grid,threads>>>((float*)(pDest->pixels), pDest->pitch, pDest->width, pDest->height);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
CUDA_SAFE_CALL( cudaThreadSynchronize() );
return GV_SUCCESS;
}
:( :(