CUDA function does not return to main

I have a simple edge detection function I’m trying to debug but it seems like the function does not return to main after the last instruction. I can see the function computing the correct values in the debugger but I can’t explain why it won’t return to its caller… I’m sure it’s something simple but I just don’t see it. Main() calls mySobelKernel() . It stops at the last line of global void mySobel()

[codebox]#include <vector_functions.h>

#include <device_launch_parameters.h>

#include <device_functions.h>

extern “C”


  uint3 const threadIdx;

  uint3 const blockIdx;

  dim3 const blockDim;

  dim3 const gridDim;

  int const warpSize;


global void mySobel(void* img, void* imgOut, int height, int width, int* GX, int GY[3], int xorder, int yorder, int step)


// int bx = blockIdx.x;

// int by = blockIdx.y;

// Thread index

//int tx = threadIdx.x;

//int ty = threadIdx.y;


int gxd;

int iStart, jStart, iEnd, jEnd;		//index to the start and end of the sobel matrix

int	X, Y;							//pixel indexes

int16_t I, J, SUM;					//mask indexes

int16_t sumX=0, sumY=0;

int8_t* data = (int8_t*)img;	//pointer to the start of the image as a byte

int16_t* dataOut = (int16_t*)imgOut;			//16bit signed data output

for(Y=0; Y < height; Y++)


for(X=0; X < width; X++)




        //calculate where to start indexing sobel matrix- this is only used when the pixel

        //being referenced is on the edge of the image

        iStart= (X==0) ? 0: -1;

        iEnd= (X==width-1)?0: 1;

        jStart= (Y==0) ? 0: -1;

        jEnd= (Y==height-1)? 0: 1;

        //perform x order derivative




	for(I=iStart; I<=iEnd; I++)  


		for(J=jStart; J<=jEnd; J++)  

		{ // //(T*)((char*)BaseAddress + Row * pitch) + Column

                        gxd= ((int *)((char*) GX + (J+1) * step))[I + 1];    //debug line - remove for release

                        //multiplies a part of the image(3x3) with the sobel matrix 3x3

                        sumX += ((unsigned int8_t*)(data + (Y + J)* width))[X + I]* gxd; // GX[J+1][I+1];




	SUM = sumX;	//do either xderiv or yderiv




            //-------Y GRADIENT APPROXIMATION-------//

            for(I=iStart; I<=iEnd; I++)


                for(J=jStart; J<=jEnd; J++)


                    sumY += ((unsigned int8_t*) (data + (Y + J)*width))[X + I]*GY[J+1][I+1];



	SUM = sumY;	


	//write out result to memory destination

	//((__int16*) (imgOut->imageData + Y*imgOut->widthStep))[X] = SUM;

        ((int16_t*) (dataOut + Y*width))[X] = SUM;



SUM=0;      //stops here -fails to return to main


//(void* img, void* imgOut, int height, int width, int widthStep, int GX[3], int GY[3], int xorder, int yorder, int appertureSize)

extern “C” void mySobelKernel(void* src, void* dst, int h, int w, int* XM, int YM[3], int xdev,int ydev, int THREAD_N, int stepG)


#define BLOCK_SIZE 16

// Basic Matrix dimensions (can be amplified by command line switch)

// (chosen as multiples of the thread block size for simplicity)

#define WA (w  * BLOCK_SIZE) // Matrix A width

#define HA (h * BLOCK_SIZE) // Matrix A height

#define WB (w  * BLOCK_SIZE) // Matrix B width

#define HB WA  // Matrix B height

#define WC WB  // Matrix C width

#define HC HA  // Matrix C height

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(w / threads.x, h / threads.y);

int threadsPerBlock = 256;

int blocksPerGrid = (THREAD_N + threadsPerBlock - 1) / threadsPerBlock;

mySobel<<<grid, threads>>>(src, dst, h, w, XM, YM, xdev, ydev, stepG);




Bumping after less than 2 hours? Not nice…

Anyway, what do you mean by “fails to return to main”? have you went through all the execution threads in simulation?

P.S. Why is this in “CUDA on Linux” and not in “CUDA Programming and Development”?