Hi all,I am developing an application to do Connected Components Labeling on the GPU.
I am basing my implementation from an article at the Journal of Parallel and Distributed Computing by Kalentev et al.
Their implementation uses padding and I want to avoid that,because it would be painful to have all those second and third buffers,so I decided to use conditions instead.
The input is a binary image.
In the first Kernel every pixel in the image that is not zero gets initialized by a unique pixel_id.
In the second Kernel every pixel in the array looks at it’s neighbors and picks the minimum value.
In the third Kernel links labels together.
The third Kernel throws a CUDA error: unspecified launch failure
The debugger says there is an invalid device memory access
PS:In a re-run the algorithm no longer works with the smaller image.
Any help would be greatly appreciated.
here is the code for the kernels
//Labels is the binary image
#include "limits.h"
__global__ void initLabels(unsigned char* Labels_in,unsigned int* Labels,int widthImage,int heightImage)
{
int x = blockIdx.x *blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x<widthImage &&y<heightImage)
{
Labels[y*(widthImage)+x]=Labels_in[y*(widthImage)+x]*(y*(widthImage)+x);
}
}
__global__ void Scanning(unsigned int* Labels,int widthImage,int heightImage,bool* IsNotDone)
{
int x = blockIdx.x *blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int aPos=y*widthImage+x;
unsigned int newLabel=Labels[aPos];
unsigned int left;
unsigned int right;
unsigned int up;
unsigned int down;
unsigned int minLabel=UINT_MAX;
if(x<widthImage &&y<heightImage)
{
if(newLabel)
{
if(x!=0)
{
left=Labels[aPos-1];
}
else
{
left=0;
}
if(left)minLabel=left;
if(x!=widthImage-1)
{
right=Labels[aPos+1];
}
else
{
right=0;
}
if(right&&right<minLabel)minLabel=right;
if(y!=0)
{
up=Labels[aPos-widthImage];
}
else
{
up=0;
}
if(up&&up<minLabel)minLabel=up;
if(y!=heightImage-1)
{
down=Labels[aPos+widthImage];
}
if(down&&down<minLabel)minLabel=down;
if(minLabel<newLabel)
{
unsigned int newLabel2=Labels[newLabel];
Labels[newLabel]=(unsigned int)min(newLabel2,minLabel);
IsNotDone[0]=true;
}
}
}
}
__global__ void Analysis(unsigned int* Labels,int widthImage,int heightImage)
{
int x = blockIdx.x *blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int aPos=y*widthImage+x;
unsigned int newLabel=Labels[aPos];
if(newLabel)
{
unsigned int r=Labels[newLabel];
while(r!=newLabel)
{
newLabel=Labels[r];
r=Labels[newLabel];
}
Labels[aPos]=newLabel;
}
}
And the code for the wrappers:
#include "new_kernels.cu"
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include "stdio.h"
#include <cutil_inline.h>
#define DEBUG
#ifdef DEBUG
#define MY_SAFE_CALL(x) cutilSafeCall(x)
#else
#define MY_SAFE_CALL(x) x
#endif
//----------------------------------------
// host code for KERNEL 1
//----------------------------------------
void callInitLabels(unsigned char* Labels_in,unsigned int* Labels,int widthImage,int heightImage,int threadsX,int threadsY)
{
printf("starting initialisation function \n");
dim3 block(threadsX, threadsY, 1);
dim3 grid(widthImage / block.x, heightImage / block.y, 1);
initLabels<<<grid, block>>>(Labels_in,Labels,widthImage,heightImage);
printf("finished initialisation function \n");
}
//----------------------------------------
// host code for KERNEL 2
//----------------------------------------
void callScanning(unsigned int* Labels,int widthImage,int heightImage,bool* IsNotDone,int threadsX,int threadsY)
{
printf("starting scanning function \n");
dim3 block(threadsX, threadsY, 1);
dim3 grid(widthImage / block.x, heightImage / block.y, 1);
Scanning<<<grid, block>>>(Labels,widthImage,heightImage,IsNotDone);
printf("finished scanning function \n");
cudaThreadSynchronize();
// check for error
cudaError error2 = cudaGetLastError();
if(error2 != cudaSuccess)
{
// print the CUDA error message and exit
printf("CUDA error: %s\n", cudaGetErrorString(error2));
printf("error in scanning function \n");
exit(-1);
}
}
void callAnalysis(unsigned int* Labels,int widthImage,int heightImage,int threadsX,int threadsY)
{ printf("starting analysis function \n");
dim3 block(threadsX, threadsY, 1);
dim3 grid(widthImage / block.x, heightImage / block.y, 1);
Analysis<<<grid, block>>>(Labels,widthImage,heightImage);
printf("finished analysis function \n");
cudaThreadSynchronize();
// check for error
cudaError error2 = cudaGetLastError();
if(error2 != cudaSuccess)
{
// print the CUDA error message and exit
printf("CUDA error: %s\n", cudaGetErrorString(error2));
printf("error in analysis function \n");
exit(-1);
}
}
void doCCL(unsigned char* Labels_in,unsigned int * Labels,int widthImage,int heightImage,int threadsX,int threadsY)
{
callInitLabels(Labels_in,Labels,widthImage,heightImage,threadsX,threadsY);
//using thrust vectors to avoid weird memcpys
thrust::device_vector<bool> isNotDone(1);
bool * raw_done;
isNotDone[0]=true;
raw_done=thrust::raw_pointer_cast(&isNotDone[0]);
// int max_iter=1;
// int iter_num=1;
while(isNotDone[0])
{
isNotDone[0]=false;
callScanning(Labels,widthImage,heightImage,raw_done,threadsX,threadsY);
callAnalysis(Labels,widthImage,heightImage,threadsX,threadsY);
// MY_SAFE_CALL(cudaMemcpy(isNotDone,isNotDoneDevice,sizeof(bool),cudaMemcpyDeviceToHost));
// ++iter_num;
}
}
Thank you in advance.
Apostolis