Hello everyone,
I am trying to implement a blob detector. Here I found the algorithm http://hpcg.purdue.edu/papers/Stava2011CCL.pdf . My implementation dont works and I can’t find the error. So please take a look at my code.
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include <limits.h>
#define PGMHeaderSize 0x40
inline bool loadPPM(const char *file, unsigned char **data, unsigned int *w, unsigned int *h, unsigned int *channels, unsigned char *max)
{
FILE *fp = NULL;
fp = fopen(file, "rb");
if (!fp) {
fprintf(stderr, "__LoadPPM() : unable to open file\n" );
return false;
}
// check header
char header[PGMHeaderSize];
if (fgets(header, PGMHeaderSize, fp) == NULL)
{
fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;
}
if (strncmp(header, "P5", 2) == 0)
{
*channels = 1;
}
else if (strncmp(header, "P6", 2) == 0)
{
*channels = 3;
}
else
{
fprintf(stderr,"__LoadPPM() : File is not a PPM or PGM image\n" );
*channels = 0;
return false;
}
// parse header, read maxval, width and height
unsigned int width = 0;
unsigned int height = 0;
unsigned int maxval = 0;
unsigned int i = 0;
while (i < 3)
{
if (fgets(header, PGMHeaderSize, fp) == NULL)
{
fprintf(stderr,"__LoadPPM() : reading PGM header returned NULL\n" );
return false;
}
if (header[0] == '#')
{
continue;
}
if (i == 0)
{
i += sscanf(header, "%u %u %u", &width, &height, &maxval);
}
else if (i == 1)
{
i += sscanf(header, "%u %u", &height, &maxval);
}
else if (i == 2)
{
i += sscanf(header, "%u", &maxval);
}
}
// check if given handle for the data is initialized
if (NULL != *data)
{
if (*w != width || *h != height)
{
fprintf(stderr, "__LoadPPM() : Invalid image dimensions.\n" );
return false;
}
}
else
{
*data = (unsigned char *) malloc(sizeof(unsigned char) * width * height * *channels);
if (!data) {
fprintf(stderr, "__LoadPPM() : Unable to allocate hostmemory\n");
return false;
}
*w = width;
*h = height;
*max = maxval;
}
// read and close file
if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == 0)
{
fprintf(stderr, "__LoadPPM() : read data returned error.\n" );
fclose(fp);
return false;
}
fclose(fp);
return true;
}
inline bool savePPM(const char *file, unsigned char *data, unsigned int w, unsigned int h, unsigned int channels)
{
assert(NULL != data);
assert(w > 0);
assert(h > 0);
std::fstream fh(file, std::fstream::out | std::fstream::binary);
if (fh.bad())
{
fprintf(stderr, "__savePPM() : Opening file failed.\n" );
return false;
}
if (channels == 1)
{
fh << "P5\n";
}
else if (channels == 3)
{
fh << "P6\n";
}
else
{
fprintf(stderr, "__savePPM() : Invalid number of channels.\n" );
return false;
}
fh << w << "\n" << h << "\n" << 0xff << std::endl;
for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i)
{
fh << data[i];
}
fh.flush();
if (fh.bad())
{
fprintf(stderr,"__savePPM() : Writing data failed.\n" );
return false;
}
fh.close();
return true;
}
#define TILE_W 14
#define TILE_H 14
#define BLOCK_W (TILE_W+(2))
#define BLOCK_H (TILE_H+(2))
#define BLOCK_S BLOCK_W*BLOCK_H
#define DIV 4 // Divider for Blob definition
__global__ void UpdateBlock (unsigned int *Labels, const unsigned int *Heads,
const unsigned int width, const unsigned int hight, bool *Labelchanges) {
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
if(i>=width*hight) return;
if(i==0) *Labelchanges=false;
/** UpdateBorders */
if(Labels[i]<UINT_MAX){ //Blob
unsigned int head = Heads[i]; // global idx of my head
Labels[i]=Labels[ head ];
}
}
__global__ void BlockLabeling (const unsigned char *picture, unsigned int *Labels, unsigned int *Heads,
const unsigned int width, const unsigned int hight, const unsigned char maxval, bool *Labelchanges){
//Indexes
const int x = blockIdx.x * TILE_W + threadIdx.x - 1; // x image location
const int y = blockIdx.y * TILE_H + threadIdx.y - 1; // y image location
const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x; // local index
__shared__ unsigned int sLab[BLOCK_S];
__shared__ unsigned int sMsg[3]; // { lock, index, value }
__shared__ bool sChanged;
// initialize
if(threadIdx.x==0 && threadIdx.y==0){
sMsg[0] = 0;
}
// Threads which are not in the picture just write "no label" to sLab
if(x<0 || y<0 || x>=width || y>=hight){
sLab[lidx]=UINT_MAX;
return;
}
const unsigned int gidx = y * width + x; // global index
const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
const bool blob = picture[gidx] < maxval/DIV; // 0= black, maxval= white
if(gidx==0) *Labelchanges=false;
if(blob && intile) sLab[lidx] = gidx;
else sLab[lidx] = UINT_MAX;
__syncthreads();
/** Labeling */
if (blob && intile) {
unsigned int nidx, minval;
int dy, dx;
while(1){
__syncthreads();
sChanged=false; // reset sChanged to false
__syncthreads();
//*********** Phase 1 : Check neighbor Labels ****************************************
if(sLab[lidx]==gidx){
// check, if blob- head should be changed
if(sMsg[0]==1 && sMsg[1]==gidx){
minval = sMsg[2];
}
else minval=UINT_MAX;
// go through neighbors
for(dy=-1;dy<=1;dy++){
for(dx=-1;dx<=1;dx++){
nidx = lidx + dy*BLOCK_W + dx;
if(sLab[nidx]<minval) minval=sLab[nidx];
}
}
__syncthreads(); // sync before writing
if(minval < sLab[lidx]){ // change the label
sChanged = true;
sLab[lidx] = minval;
}
}
__syncthreads();
sMsg[0]=0; // reset lock in Msg
__syncthreads(); // sync with all threads
if(!sChanged) break; // if no thread changed anything, break the while
//*********** Phase 2 : find root to blob- head ****************************************
// find root
unsigned int lroot = lidx, groot = gidx, loff, xroot, yroot; // local and global root idx, offset to own idx, x and y position of root in picture
while(sLab[lroot] != groot){
groot = sLab[lroot];
xroot = groot % width;
yroot = groot / width;
dx = x - xroot;
dy = y - yroot;
loff = dy * BLOCK_W + dx;
lroot = lidx - loff; // lroot + loff = lidx
}
__syncthreads(); // sync before writing
sLab[lidx]=groot;
__syncthreads();
//*********** Phase 3 : force blob- head- change if necessary ****************************************
// go through neighbors
minval = UINT_MAX;
for(dy=-1;dy<=1;dy++){
for(dx=-1;dx<=1;dx++){
nidx = lidx + dy*BLOCK_W + dx;
if(sLab[nidx]<minval) minval=sLab[nidx];
}
}
if(minval < sLab[lidx]){
// force other tread to change the head
unsigned int old_lock = atomicExch(&(sMsg[0]),1); // read and set lock atomically
if(old_lock == 0){ // check, if lock was set before
sMsg[1] = sLab[lidx];
sMsg[2] = minval;
}
}
__syncthreads();
}
// writing results to output
Heads[gidx]=sLab[lidx];
Labels[gidx]=sLab[lidx];
}
}
__global__ void BorderMerge (unsigned int *Labels, const unsigned int *Heads,
const unsigned int width, const unsigned int hight, bool *Labelchanges) {
//Indexes
const int x = blockIdx.x * TILE_W + threadIdx.x - 1; // x image location
const int y = blockIdx.y * TILE_H + threadIdx.y - 1; // y image location
const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x; // local index
__shared__ unsigned int sLab[BLOCK_S];
__shared__ unsigned int sMsg[3]; // { lock, index, value }
__shared__ bool sChanged;
__shared__ bool sFirstChange;
if(threadIdx.x==0 && threadIdx.y==0){
sChanged = true;
sFirstChange = true;
sMsg[0] = 0;
}
// Threads which are not in the picture just write "no label" to sLab
if(x<0 || y<0 || x>=width || y>=hight){
sLab[lidx]=UINT_MAX;
return;
}
const unsigned int gidx = y * width + x; // global index
sLab[lidx] = Labels[gidx];
const unsigned int head = Heads[gidx]; // gidx of head of the pixel
const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
const bool blob = sLab[lidx]<UINT_MAX;
__syncthreads();
/** BorderMerge */
if(blob && intile){
unsigned int minval, nidx;
int dx, dy;
while(sChanged){
__syncthreads();
sChanged=false;
__syncthreads();
if(sMsg[0]==1 && sMsg[1]==gidx){
minval = sMsg[2];
}
else minval=UINT_MAX;
//***************** Merging rows ******************************************
if(y%TILE_H==0 && y>0){
dy=-1;
for(dx=-1;dx<=1;dx++){
nidx = lidx + dy*BLOCK_W + dx;
if(sLab[nidx]<minval) minval = sLab[nidx];
}
}
//***************** Merging cols ******************************************
else if(x%TILE_W==0 && x>0){
dx=-1;
for(dy=-1;dy<=1;dy++){
nidx = lidx + dy*BLOCK_W + dx;
if(sLab[nidx]<minval) minval = sLab[nidx];
}
}
__syncthreads();
sMsg[0]=0; // reset lock in Msg
__syncthreads(); // sync with all threads before writing
//***************** Set Labels ******************************************
if(minval<sLab[lidx]){
sChanged=true;
if(head==gidx){ // I am head
sLab[lidx]=minval;
if(sFirstChange){
sFirstChange=false; // For changing *Labelchanges only once
*Labelchanges=true;
}
}
else{
// change the Head
unsigned int old_lock = atomicExch(&(sMsg[0]),1); // read and set lock atomically
if(old_lock == 0){ // check, if lock was set before
sMsg[1] = head; // my head should change to minval
sMsg[2] = minval;
sLab[lidx]=minval; // I change to minval
}
}
}
__syncthreads();
}
__syncthreads();
// Change Label if head
if(head==gidx) Labels[gidx]=sLab[lidx];
}
}
__global__ void write_image
(const unsigned int *Labels, const unsigned char *in, unsigned char *out, const unsigned int width, const unsigned int hight){
//Indexes
const int x = blockIdx.x * TILE_W + threadIdx.x - 1; // x image location
const int y = blockIdx.y * TILE_H + threadIdx.y - 1; // y image location
const unsigned int lidx = threadIdx.y * BLOCK_W + threadIdx.x; // local index
__shared__ unsigned int sLab[BLOCK_S];
// Threads which are not in the picture just write "no label" to sLab
if(x<0 || y<0 || x>=width || y>=hight){
sLab[lidx]=UINT_MAX;
return;
}
const unsigned int gidx = y * width + x; // global index
const bool intile = (threadIdx.x >= 1) && (threadIdx.x < (BLOCK_W-1)) && (threadIdx.y >= 1) && (threadIdx.y < (BLOCK_H-1));
sLab[lidx] = Labels[gidx];
__syncthreads();
/** Write Imagedata Array */
unsigned int nidx, own = sLab[lidx];
bool border=false; // Border Pixel of a Blob
if(intile){
if(own<UINT_MAX){ // Blob
// go through neighbors
for(int dy=-1;dy<=1;dy++){
for(int dx=-1;dx<=1;dx++){
nidx = lidx + dy*BLOCK_W + dx;
if(sLab[nidx]!=own) border=true;
}
}
}
if(border){ // At blob-border
out[3*gidx+0]=(unsigned char)( own % 255);
out[3*gidx+1]=(unsigned char)((own+50) % 255);
out[3*gidx+2]=(unsigned char)((own+100) % 255);
}
else{
unsigned char gray = in[gidx];
out[3*gidx+0]=gray;
out[3*gidx+1]=gray;
out[3*gidx+2]=gray;
}
}
}
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
inline void __checkCudaErrors(cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
file, line, (int)err, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main(){
unsigned char *in=NULL, *d_in=NULL, *out=NULL, *d_out=NULL;
unsigned int *d_lab=NULL, *d_Heads=NULL;
bool labelchanges=false, *d_labelchanges=NULL;
unsigned int w,h,channels;
unsigned char maxval;
// for time measurement
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
if(! loadPPM("../../data/pic1.pgm", &in, &w, &h, &channels, &maxval)){
fprintf(stderr, "Failed to open File\n");
exit(EXIT_FAILURE);
}
printf("Loaded file with w:%d h:%d channels:%d maxval:%d\n",w,h,channels, maxval);
if(channels != 1){
fprintf(stderr, "ERROR: Channels must be 1 ! \n");
free(in);
exit(EXIT_FAILURE);
}
unsigned int numElements = w*h;
size_t picsize = numElements * sizeof(unsigned char);
size_t labsize = numElements * sizeof(unsigned int);
size_t boolsize = sizeof(bool);
// Allocate Host Memory
out=(unsigned char *)malloc(3*picsize);
// Allocate the Device Memory
printf("Allocate Devicememory for data\n");
checkCudaErrors(cudaMalloc((void **)&d_in, picsize));
checkCudaErrors(cudaMalloc((void **)&d_out, 3*picsize));
checkCudaErrors(cudaMalloc((void **)&d_lab, labsize));
checkCudaErrors(cudaMalloc((void **)&d_labelchanges, boolsize));
checkCudaErrors(cudaMalloc((void **)&d_Heads, labsize));
// Copy to device
printf("Copy in data from the host memory to the CUDA device\n");
checkCudaErrors(cudaMemcpy(d_in, in, picsize, cudaMemcpyHostToDevice));
// variables for Kernel launches
const int threadsPerBlock = 256; // Maximum 1024 threads per block
const int blocksPerGrid =(numElements - 1) / threadsPerBlock +1;
const dim3 threadsPerBlock3(BLOCK_W, BLOCK_H);
const dim3 blocksPerGrid3((w-1)/TILE_W +1,(h-1)/TILE_H +1);
printf("CUDA dim1 kernel launches with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
printf("CUDA dim3 kernel launches with [%d %d] blocks of [%d %d] threads\n", blocksPerGrid3.x, blocksPerGrid3.y, threadsPerBlock3.x, threadsPerBlock3.y);
cudaEventRecord(start, 0); // start time measurement
// Launch BlockLabeling
printf("Launch BlobLabeling Kernel\n");
BlockLabeling<<<blocksPerGrid3, threadsPerBlock3>>>(d_in, d_lab, d_Heads, w, h, maxval, d_labelchanges );
checkCudaErrors(cudaGetLastError());
cudaDeviceSynchronize();
while(1){
// Launch ChangeHead Kernel
printf("Launch BorderMerge Kernel\n");
BorderMerge<<<blocksPerGrid3, threadsPerBlock3>>>(d_lab, d_Heads, w, h, d_labelchanges);
checkCudaErrors(cudaGetLastError());
cudaDeviceSynchronize();
// Copy bool from device to host
printf("Copy bool\n");
checkCudaErrors(cudaMemcpy(&labelchanges, d_labelchanges, boolsize, cudaMemcpyDeviceToHost));
// break, if there where no changes
if(!labelchanges) break;
// Launch Update Kernel
printf("Launch UpdateBlock Kernel\n");
UpdateBlock<<<blocksPerGrid, threadsPerBlock>>>(d_lab, d_Heads, w, h, d_labelchanges);
checkCudaErrors(cudaGetLastError());
cudaDeviceSynchronize();
}
// Launch write-image Kernal
printf("Launch write-image Kernel\n");
write_image<<<blocksPerGrid3, threadsPerBlock3>>>(d_lab, d_in, d_out, w, h );
checkCudaErrors(cudaGetLastError());
cudaDeviceSynchronize();
cudaEventRecord(stop, 0); // stop time measurement
cudaEventSynchronize(stop); // sync results
cudaEventElapsedTime(&time, start, stop);
printf ("Time for the kernels: %f ms\n", time);
// Copy data from device to host
printf("Copy out data from the CUDA device to the host memory\n");
checkCudaErrors(cudaMemcpy(out, d_out, 3*picsize, cudaMemcpyDeviceToHost));
// Free Device memory
printf("Free Device memory\n");
checkCudaErrors(cudaFree(d_in));
checkCudaErrors(cudaFree(d_lab));
checkCudaErrors(cudaFree(d_labelchanges));
checkCudaErrors(cudaFree(d_Heads));
// Save Picture
printf("Save Picture\n");
bool saved = savePPM("out_blob.ppm", out, w, h, 3);
printf("Free Host memory\n");
free(in);
free(out);
if (!saved){
fprintf(stderr, "Failed to save File\n");
exit(EXIT_FAILURE);
}
printf("Done\n");
}