Blob detection with cuda-c

Hello everyone,
I am trying to implement a blob detector. Here I found the algorithm http://hpcg.purdue.edu/papers/Stava2011CCL.pdf . My implementation dont works and I can’t find the error. So please take a look at my code.

#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <limits.h>

#define PGMHeaderSize  			0x40

inline bool loadPPM(const char *file, unsigned char **data, unsigned int *w, unsigned int *h, unsigned int *channels, unsigned char *max)
{
    FILE *fp = NULL;

    fp = fopen(file, "rb");
    if (!fp) {
        fprintf(stderr, "__LoadPPM() : unable to open file\n" );
		return false;
    }

    // check header
    char header[PGMHeaderSize];

    if (fgets(header, PGMHeaderSize, fp) == NULL)
    {
        fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
        return false;
    }

    if (strncmp(header, "P5", 2) == 0)
    {
        *channels = 1;
    }
    else if (strncmp(header, "P6", 2) == 0)
    {
        *channels = 3;
    }
    else
    {
        fprintf(stderr,"__LoadPPM() : File is not a PPM or PGM image\n" );
        *channels = 0;
        return false;
    }

    // parse header, read maxval, width and height
    unsigned int width = 0;
    unsigned int height = 0;
    unsigned int maxval = 0;
    unsigned int i = 0;

    while (i < 3)
    {
        if (fgets(header, PGMHeaderSize, fp) == NULL)
        {
            fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
            return false;
        }

        if (header[0] == '#')
        {
            continue;
        }

        if (i == 0)
        {
            i += sscanf(header, "%u %u %u", &width, &height, &maxval);
        }
        else if (i == 1)
        {
            i += sscanf(header, "%u %u", &height, &maxval);
        }
        else if (i == 2)
        {
            i += sscanf(header, "%u", &maxval);
        }
    }

    // check if given handle for the data is initialized
    if (NULL != *data)
    {
        if (*w != width || *h != height)
        {
            fprintf(stderr, "__LoadPPM() : Invalid image dimensions.\n" );
            return false;
        }
    }
    else
    {
        *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height * *channels);
        if (!data) {
         fprintf(stderr, "__LoadPPM() : Unable to allocate hostmemory\n");
         return false;
		}
        *w = width;
        *h = height;
        *max = maxval;
    }

    // read and close file
    if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
    {
        fprintf(stderr, "__LoadPPM() : read data returned error.\n" );
        fclose(fp);
        return false;
    }

    fclose(fp);

    return true;
}

inline bool savePPM(const char *file, unsigned char *data, unsigned int w, unsigned int h, unsigned int channels)
{
    assert(NULL != data);
    assert(w > 0);
    assert(h > 0);

    std::fstream fh(file, std::fstream::out | std::fstream::binary);

    if (fh.bad())
    {
        fprintf(stderr, "__savePPM() : Opening file failed.\n" );
        return false;
    }

    if (channels == 1)
    {
        fh << "P5\n";
    }
    else if (channels == 3)
    {
        fh << "P6\n";
    }
    else
    {
        fprintf(stderr, "__savePPM() : Invalid number of channels.\n" );
        return false;
    }

    fh << w << "\n" << h << "\n" << 0xff << std::endl;

    for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
    {
        fh << data[i];
    }

    fh.flush();

    if (fh.bad())
    {
        fprintf(stderr,"__savePPM() : Writing data failed.\n" );
        return false;
    }

    fh.close();

    return true;
}

#define TILE_W		14
#define TILE_H		14
#define BLOCK_W		(TILE_W+(2))
#define BLOCK_H		(TILE_H+(2))
#define BLOCK_S		BLOCK_W*BLOCK_H
#define DIV			4					// Divider for Blob definition

__global__ void UpdateBlock (unsigned int *Labels, const unsigned int *Heads, 
const unsigned int width, const unsigned int hight, bool *Labelchanges) {
	unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
	if(i>=width*hight) return;
	
	if(i==0) 	*Labelchanges=false;
		
	/**          UpdateBorders         */
			
	if(Labels[i]<UINT_MAX){	//Blob
		unsigned int head = Heads[i];	// global idx of my head
		Labels[i]=Labels[ head ];
	}
}

__global__ void BlockLabeling (const unsigned char *picture, unsigned int *Labels, unsigned int *Heads,
const unsigned int width, const unsigned int hight, const unsigned char maxval, bool *Labelchanges){
	//Indexes
	const int x = blockIdx.x * TILE_W + threadIdx.x - 1;				// x image location
	const int y = blockIdx.y * TILE_H + threadIdx.y - 1;				// y image location
	const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x;		// local index
	
	__shared__ unsigned int sLab[BLOCK_S];			
	__shared__ unsigned int sMsg[3];			// { lock, index, value }
	__shared__ bool sChanged;
		
	// initialize
	if(threadIdx.x==0 && threadIdx.y==0){
		sMsg[0] = 0;
	}
	
	// Threads which are not in the picture just write "no label" to sLab
	if(x<0 || y<0 || x>=width || y>=hight){		
		sLab[lidx]=UINT_MAX;
		return;
	}
		
	const unsigned int gidx = y * width + x;							// global index
	const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
	const bool blob = picture[gidx] < maxval/DIV;						// 0= black, maxval= white
	
	if(gidx==0) 	*Labelchanges=false;
			
	if(blob && intile)		sLab[lidx] = gidx;
	else 					sLab[lidx] = UINT_MAX;
		
	__syncthreads();
		
	/**       Labeling       */
	if (blob && intile) {
		unsigned int nidx, minval;
		int dy, dx;
		
		while(1){
			__syncthreads();
			sChanged=false;					// reset sChanged to false
			__syncthreads();
			
			//*********** Phase 1 : Check neighbor Labels ****************************************
			if(sLab[lidx]==gidx){
				// check, if blob- head should be changed
				if(sMsg[0]==1 && sMsg[1]==gidx){  
					minval = sMsg[2];
					
				}
				else minval=UINT_MAX;
				
								
				// go through neighbors 
				for(dy=-1;dy<=1;dy++){
					for(dx=-1;dx<=1;dx++){
						nidx = lidx + dy*BLOCK_W + dx;
						if(sLab[nidx]<minval) minval=sLab[nidx];
					}
				}
				
				__syncthreads();		// sync before writing
								
				if(minval < sLab[lidx]){	// change the label
					sChanged = true;
					sLab[lidx] = minval;
					
				}
			}
			
			__syncthreads();		
			sMsg[0]=0;			// reset lock in Msg
			__syncthreads();	// sync with all threads
						
			if(!sChanged)		break;		// if no thread changed anything, break the while
						
			//*********** Phase 2 : find root to blob- head ****************************************
				
			// find root
			unsigned int lroot = lidx, groot = gidx, loff, xroot, yroot;			// local and global root idx, offset to own idx, x and y position of root in picture
					
			while(sLab[lroot] != groot){
				groot = sLab[lroot];
				xroot = groot % width;
				yroot = groot / width;
				dx = x - xroot;
				dy = y - yroot;
				loff = dy * BLOCK_W + dx;
				lroot = lidx - loff;		// lroot + loff = lidx			
			}
			
			__syncthreads();		// sync before writing
			sLab[lidx]=groot;
			__syncthreads();
			
			
			//*********** Phase 3 : force blob- head- change if necessary ****************************************
					
			// go through neighbors 
			minval = UINT_MAX;
			for(dy=-1;dy<=1;dy++){				
				for(dx=-1;dx<=1;dx++){
					nidx = lidx + dy*BLOCK_W + dx;
					if(sLab[nidx]<minval) minval=sLab[nidx];
				}
			}
			if(minval < sLab[lidx]){
				// force other tread to change the head
				unsigned int old_lock = atomicExch(&(sMsg[0]),1);		// read and set lock atomically
				if(old_lock == 0){										// check, if lock was set before
					sMsg[1] = sLab[lidx];
					sMsg[2] = minval;
				}
			}
			__syncthreads();
		}
		// writing results to output
		Heads[gidx]=sLab[lidx];
		Labels[gidx]=sLab[lidx];
	}
}

__global__ void BorderMerge (unsigned int *Labels, const unsigned int *Heads, 
const unsigned int width, const unsigned int hight, bool *Labelchanges) {
	//Indexes
	const int x = blockIdx.x * TILE_W + threadIdx.x - 1;				// x image location
	const int y = blockIdx.y * TILE_H + threadIdx.y - 1;				// y image location
	const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x;		// local index
	
	__shared__ unsigned int sLab[BLOCK_S];	
	__shared__ unsigned int sMsg[3];			// { lock, index, value }
	__shared__ bool sChanged;
	__shared__ bool sFirstChange;
	
	if(threadIdx.x==0 && threadIdx.y==0){
		sChanged = true;
		sFirstChange = true;
		sMsg[0] = 0;
	}
			
	// Threads which are not in the picture just write "no label" to sLab
	if(x<0 || y<0 || x>=width || y>=hight){		
		sLab[lidx]=UINT_MAX;
		return;
	}
		
	const unsigned int gidx = y * width + x;							// global index
	
	sLab[lidx] = Labels[gidx];
	
	const unsigned int head = Heads[gidx];								// gidx of head of the pixel
	const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
	const bool blob = sLab[lidx]<UINT_MAX;	
			
	__syncthreads();
		
	/**          BorderMerge         */
	 
	if(blob && intile){
		unsigned int minval, nidx;
		int dx, dy;
		
		while(sChanged){
			__syncthreads();
			sChanged=false;	
			__syncthreads();
			
			if(sMsg[0]==1 && sMsg[1]==gidx){  
				minval = sMsg[2];
			}
			else		minval=UINT_MAX;
		
			//*****************   Merging rows ******************************************
			if(y%TILE_H==0 && y>0){
				dy=-1;
				for(dx=-1;dx<=1;dx++){
					nidx = lidx + dy*BLOCK_W + dx;
					if(sLab[nidx]<minval)  		minval = sLab[nidx];
				}	
			}
	
			//*****************   Merging cols ******************************************
			else if(x%TILE_W==0 && x>0){
				dx=-1;
				for(dy=-1;dy<=1;dy++){
					nidx = lidx + dy*BLOCK_W + dx;
					if(sLab[nidx]<minval)  		minval = sLab[nidx];
				}
			}			
			
			__syncthreads();		
			sMsg[0]=0;			// reset lock in Msg
			__syncthreads();	// sync with all threads before writing
			
			//*****************   Set Labels ******************************************
			
			if(minval<sLab[lidx]){
				sChanged=true;
				if(head==gidx){				// I am head
					sLab[lidx]=minval;
					if(sFirstChange){
						sFirstChange=false;		// For changing *Labelchanges only once
						*Labelchanges=true;
					}
				}
				else{
					// change the Head
					unsigned int old_lock = atomicExch(&(sMsg[0]),1);		// read and set lock atomically
					if(old_lock == 0){		// check, if lock was set before
						sMsg[1] = head;		// my head should change to minval
						sMsg[2] = minval;
															
						sLab[lidx]=minval;	// I change to minval
					}
				}
			}
			__syncthreads();
		}
		__syncthreads();
		
		// Change Label if head
		if(head==gidx)		Labels[gidx]=sLab[lidx];
	}
}

__global__ void write_image
(const unsigned int *Labels, const unsigned char *in, unsigned char *out, const unsigned int width, const unsigned int hight){
	//Indexes
	const int x = blockIdx.x * TILE_W + threadIdx.x - 1;				// x image location
	const int y = blockIdx.y * TILE_H + threadIdx.y - 1;				// y image location
	const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x;		// local index
	
	__shared__ unsigned int sLab[BLOCK_S];		
			
	// Threads which are not in the picture just write "no label" to sLab
	if(x<0 || y<0 || x>=width || y>=hight){		
		sLab[lidx]=UINT_MAX;
		return;
	}
		
	const unsigned int gidx = y * width + x;							// global index
	const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
			
	sLab[lidx] = Labels[gidx];
			
	__syncthreads();
	
	/**          Write Imagedata Array        */
	
	unsigned int nidx, own = sLab[lidx];
	bool border=false;	// Border Pixel of a Blob
	if(intile){
		if(own<UINT_MAX){	// Blob
			// go through neighbors
			for(int dy=-1;dy<=1;dy++){				
				for(int dx=-1;dx<=1;dx++){
					nidx = lidx + dy*BLOCK_W + dx;
					if(sLab[nidx]!=own)		border=true;			
				}
			}
		}
	
		if(border){				// At blob-border
			out[3*gidx+0]=(unsigned char)( own      % 255);
			out[3*gidx+1]=(unsigned char)((own+50)  % 255);
			out[3*gidx+2]=(unsigned char)((own+100) % 255);
		}
		else{
			unsigned char gray = in[gidx];
			out[3*gidx+0]=gray;
			out[3*gidx+1]=gray;
			out[3*gidx+2]=gray;
		}
	}
	
}

#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line)
{
    if (cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
                file, line, (int)err, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main(){
    unsigned char *in=NULL, *d_in=NULL, *out=NULL, *d_out=NULL;
    unsigned int *d_lab=NULL, *d_Heads=NULL;
    bool labelchanges=false, *d_labelchanges=NULL;
    unsigned int w,h,channels;
    unsigned char maxval;
        
    // for time measurement
    cudaEvent_t start, stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
    
    if(! loadPPM("../../data/pic1.pgm", &in, &w, &h, &channels, &maxval)){	
		fprintf(stderr, "Failed to open File\n");
        exit(EXIT_FAILURE);
	}
	
	printf("Loaded file with   w:%d   h:%d   channels:%d     maxval:%d\n",w,h,channels, maxval);
	
	if(channels != 1){
		fprintf(stderr, "ERROR: Channels must be 1 ! \n");
		free(in);
        exit(EXIT_FAILURE);
	}
	
	unsigned int numElements = w*h;
    size_t picsize	 = numElements * sizeof(unsigned char);
    size_t labsize	 = numElements * sizeof(unsigned int);
    size_t boolsize  = sizeof(bool); 
    
    // Allocate Host Memory
    out=(unsigned char *)malloc(3*picsize);
                   
    // Allocate the Device Memory
    printf("Allocate Devicememory for data\n");
    checkCudaErrors(cudaMalloc((void **)&d_in, picsize));
    checkCudaErrors(cudaMalloc((void **)&d_out, 3*picsize));
    checkCudaErrors(cudaMalloc((void **)&d_lab, labsize));
    checkCudaErrors(cudaMalloc((void **)&d_labelchanges, boolsize));
    checkCudaErrors(cudaMalloc((void **)&d_Heads, labsize));
            
    // Copy to device
    printf("Copy in data from the host memory to the CUDA device\n");
    checkCudaErrors(cudaMemcpy(d_in, in, picsize, cudaMemcpyHostToDevice));
    
    // variables for Kernel launches
    const int threadsPerBlock = 256;				// Maximum 1024 threads per block
    const int blocksPerGrid =(numElements - 1) / threadsPerBlock +1;
    const dim3 threadsPerBlock3(BLOCK_W, BLOCK_H);	
	const dim3 blocksPerGrid3((w-1)/TILE_W +1,(h-1)/TILE_H +1);
	
	printf("CUDA dim1 kernel launches with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
	printf("CUDA dim3 kernel launches with [%d %d] blocks of [%d %d] threads\n", blocksPerGrid3.x, blocksPerGrid3.y, threadsPerBlock3.x, threadsPerBlock3.y);
       
    cudaEventRecord(start, 0);		// start time measurement
    
    // Launch BlockLabeling
	printf("Launch BlobLabeling Kernel\n");
	BlockLabeling<<<blocksPerGrid3, threadsPerBlock3>>>(d_in, d_lab, d_Heads, w, h, maxval, d_labelchanges  );
	checkCudaErrors(cudaGetLastError());
	cudaDeviceSynchronize();
    
    while(1){
			
		// Launch ChangeHead Kernel
		printf("Launch BorderMerge Kernel\n");   
		BorderMerge<<<blocksPerGrid3, threadsPerBlock3>>>(d_lab, d_Heads, w, h, d_labelchanges);
		checkCudaErrors(cudaGetLastError());
		cudaDeviceSynchronize();
		
		// Copy bool from device to host
		printf("Copy bool\n");
		checkCudaErrors(cudaMemcpy(&labelchanges, d_labelchanges, boolsize, cudaMemcpyDeviceToHost));	
		
		// break, if there where no changes
		if(!labelchanges) break;
		
		// Launch Update Kernel
		printf("Launch UpdateBlock Kernel\n");   
		UpdateBlock<<<blocksPerGrid, threadsPerBlock>>>(d_lab, d_Heads, w, h,  d_labelchanges);
    	checkCudaErrors(cudaGetLastError());
    	cudaDeviceSynchronize();  	
       	
	}
		
	// Launch write-image Kernal
	printf("Launch write-image Kernel\n");  
	write_image<<<blocksPerGrid3, threadsPerBlock3>>>(d_lab, d_in, d_out,  w, h );
	checkCudaErrors(cudaGetLastError());
	cudaDeviceSynchronize();
	
    cudaEventRecord(stop, 0);		// stop time measurement 
    cudaEventSynchronize(stop);		// sync results
	cudaEventElapsedTime(&time, start, stop);
	printf ("Time for the kernels: %f ms\n", time);
           
    // Copy data from device to host
    printf("Copy out data from the CUDA device to the host memory\n");
    checkCudaErrors(cudaMemcpy(out, d_out, 3*picsize, cudaMemcpyDeviceToHost));		
    
    // Free Device memory
    printf("Free Device memory\n");
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_lab));
    checkCudaErrors(cudaFree(d_labelchanges));
    checkCudaErrors(cudaFree(d_Heads));
    
    // Save Picture
    printf("Save Picture\n");
    bool saved = savePPM("out_blob.ppm", out, w,  h,  3);
           
    printf("Free Host memory\n");
    free(in);
    free(out);
        
    if (!saved){
		fprintf(stderr, "Failed to save File\n");
		exit(EXIT_FAILURE);
	}
                   
    printf("Done\n");
}

The BlockLabeling seems to work fine (comment the while- loop in the main out). When I run the while only ouce, some parts of the picture, which should be in blob, are no more in blob. So the error is probably in the Merging or Updating kernel.
Labels are the global indexes of the pixels. At end, the lowest index in the blob should be the label of all pixels in the blob. In Heads are the original Labels from the BlockLabeling. Changes are only occur in labels. So in Heads are the lowest index in the blob within the Block. I defined Heads[gidx] as “Head” of that pixel. When the head changes in the BorderMerge, the other pixels should follow in the Update- kernel. The sMsg[3] is for informing another thread, that he should change the label of his pixel. I hope, this is the correct way to do that.

I have found the error.

In the BlockLabeling only pixels in Tile should write to output. And in BorderMerge also the lower and right borders has to be checked.

Hello,
I am working on same thing right now. I just wanted to know a bit about performance of this implementation. Does the above works?

It works, but the performance is worse than a C implementation on CPU.

There are some freely available GPU implementation of connected component labeling (CCL) available, see the links at
https://devtalk.nvidia.com/default/topic/915873/labeling-connected-component-labeling-in-cuda
One could consider also using generic CCL methods for graphs (not specialized for images)

But I highly doubt that any of these matches (or even exceeds) the performance of a state-of-the-art CPU labeling algorithm. A C++ implementation of a state of the art CCL algorithm can label a 2K (2000 x 1500 pixel) image in 10 milliseconds on a single CPU core (3.5 Ghz Xeon Haswell) . Try to achieve that on the GPU.

CCL just seems as one of the ‘hard’ algorithms to parallelize and adapt efficently for the GPU. State-of-the-art CCL algorithms often use dynamic datastructure which are hard to port. Maybe NVIDIA comes up with a high-performance implementation of CCL in a future version of its NPP library.

Hello
is there now any update in cuda containing implementation of CCL
there is Cuda based CCL but that doesnt work.
It is hard to implement CCL and get better result than CPU.

NPP provides now a method for connected component calculation, see https://docs.nvidia.com/cuda/npp/group__image__filter__label__markers.html .

But I am not sure whether you get the ‘usual’ speedup, as the CPU routines are quite advanced and very fast (see https://github.com/prittt/YACCLAB )