Application working correctly for a small image but not for larger ones

Hi all,I am developing an application to do Connected Components Labeling on the GPU.

I am basing my implementation from an article at the Journal of Parallel and Distributed Computing by Kalentev et al.

Their implementation uses padding and I want to avoid that,because it would be painful to have all those second and third buffers,so I decided to use conditions instead.

The input is a binary image.

In the first Kernel every pixel in the image that is not zero gets initialized by a unique pixel_id.

In the second Kernel every pixel in the array looks at it’s neighbors and picks the minimum value.

In the third Kernel links labels together.

The third Kernel throws a CUDA error: unspecified launch failure

The debugger says there is an invalid device memory access

PS:In a re-run the algorithm no longer works with the smaller image.

Any help would be greatly appreciated.

here is the code for the kernels

//Labels is the binary image

#include "limits.h"

__global__ void initLabels(unsigned char* Labels_in,unsigned int* Labels,int widthImage,int heightImage)

{

    int x = blockIdx.x *blockDim.x + threadIdx.x;

     int y = blockIdx.y * blockDim.y + threadIdx.y;

if(x<widthImage &&y<heightImage)

{

     Labels[y*(widthImage)+x]=Labels_in[y*(widthImage)+x]*(y*(widthImage)+x);

}    

}

__global__ void Scanning(unsigned int* Labels,int widthImage,int heightImage,bool* IsNotDone)

{

    int x = blockIdx.x *blockDim.x + threadIdx.x;

    int y = blockIdx.y * blockDim.y + threadIdx.y;

unsigned int aPos=y*widthImage+x;

unsigned int newLabel=Labels[aPos];

unsigned int left;

unsigned int right;

unsigned int up;

unsigned int down;

unsigned int minLabel=UINT_MAX;

  if(x<widthImage &&y<heightImage)

{

if(newLabel)

{

if(x!=0)

{

left=Labels[aPos-1];

}

else

{

left=0;

}

if(left)minLabel=left;

if(x!=widthImage-1)

{

right=Labels[aPos+1];

}

else

{

right=0;

}

if(right&&right<minLabel)minLabel=right;

if(y!=0)

{

up=Labels[aPos-widthImage];

}

else

{

up=0;

}

if(up&&up<minLabel)minLabel=up;

if(y!=heightImage-1)

{

down=Labels[aPos+widthImage];

}

if(down&&down<minLabel)minLabel=down;

if(minLabel<newLabel)

{

unsigned int newLabel2=Labels[newLabel];

Labels[newLabel]=(unsigned int)min(newLabel2,minLabel);

IsNotDone[0]=true;

}

}

}

}

__global__ void Analysis(unsigned int* Labels,int widthImage,int heightImage)

{

int x = blockIdx.x *blockDim.x + threadIdx.x;

    int y = blockIdx.y * blockDim.y + threadIdx.y;

unsigned int aPos=y*widthImage+x;

unsigned int newLabel=Labels[aPos];

if(newLabel)

{

unsigned int r=Labels[newLabel];

while(r!=newLabel)

{

newLabel=Labels[r];

r=Labels[newLabel];

}

Labels[aPos]=newLabel;

}

}

And the code for the wrappers:

#include "new_kernels.cu"

#include <thrust/device_vector.h>

#include <thrust/device_ptr.h>

#include "stdio.h"

#include <cutil_inline.h>  

#define DEBUG

#ifdef DEBUG

#define MY_SAFE_CALL(x) cutilSafeCall(x)

#else

#define MY_SAFE_CALL(x) x

#endif

//----------------------------------------

// host code for KERNEL 1

//----------------------------------------

void callInitLabels(unsigned char* Labels_in,unsigned int* Labels,int widthImage,int heightImage,int threadsX,int threadsY)

{

printf("starting initialisation function \n");

	dim3 block(threadsX, threadsY, 1);

	dim3 grid(widthImage / block.x, heightImage / block.y, 1);

	

	initLabels<<<grid, block>>>(Labels_in,Labels,widthImage,heightImage);

	printf("finished initialisation function \n");

}

//----------------------------------------

// host code for KERNEL 2

//----------------------------------------

void callScanning(unsigned int* Labels,int widthImage,int heightImage,bool* IsNotDone,int threadsX,int threadsY)

{

   printf("starting scanning function \n");

	dim3 block(threadsX, threadsY, 1);

	dim3 grid(widthImage / block.x, heightImage / block.y, 1);

	

	Scanning<<<grid, block>>>(Labels,widthImage,heightImage,IsNotDone);

	

	printf("finished scanning function \n");

	cudaThreadSynchronize();

		// check for error

		cudaError error2 = cudaGetLastError();

		if(error2 != cudaSuccess)

		{

			// print the CUDA error message and exit

			printf("CUDA error: %s\n", cudaGetErrorString(error2));

printf("error in scanning function \n");

			exit(-1);

		}

}

void callAnalysis(unsigned int* Labels,int widthImage,int heightImage,int threadsX,int threadsY)

{	printf("starting analysis function \n");

	dim3 block(threadsX, threadsY, 1);

	dim3 grid(widthImage / block.x, heightImage / block.y, 1);

	

	Analysis<<<grid, block>>>(Labels,widthImage,heightImage);

	printf("finished analysis function \n");

		cudaThreadSynchronize();

		// check for error

		cudaError error2 = cudaGetLastError();

		if(error2 != cudaSuccess)

		{

			// print the CUDA error message and exit

			printf("CUDA error: %s\n", cudaGetErrorString(error2));

printf("error in analysis function \n");

			exit(-1);

		}

}

void doCCL(unsigned char* Labels_in,unsigned int * Labels,int widthImage,int heightImage,int threadsX,int threadsY)

{

  callInitLabels(Labels_in,Labels,widthImage,heightImage,threadsX,threadsY);

//using thrust vectors to avoid weird memcpys

thrust::device_vector<bool> isNotDone(1);

bool * raw_done;

isNotDone[0]=true;

  raw_done=thrust::raw_pointer_cast(&isNotDone[0]);

// int max_iter=1;

// int iter_num=1;

   while(isNotDone[0])

  {

  isNotDone[0]=false;

    callScanning(Labels,widthImage,heightImage,raw_done,threadsX,threadsY);

callAnalysis(Labels,widthImage,heightImage,threadsX,threadsY);

//     MY_SAFE_CALL(cudaMemcpy(isNotDone,isNotDoneDevice,sizeof(bool),cudaMemcpyDeviceToHost));

// ++iter_num;   

}

}

Thank you in advance.

Apostolis