Hi,
I’m investigating the use of CUDA to improve the real-time performance of a bit of the image processing software for a project I’m working on. I’ve been looking at using NPP and CUFFT but I need to have a region labelling algorithm as part of the processing pipeline.
So I’ve just finished writing the region labeling algorithm (my first CUDA program). I used the 4th Kernel described in section 4 in http://www.massey.ac.nz/~kahawick/cstn/089/cstn-089.pdf except I’m only labeling regions in an image that are non-zero.
I’ve posted the code below. The paper gives a fuller description of the algorithm but since I’m new to CUDA I was wondering if anyone would be able to give me some advice regarding possible optimizations. I am currently using global memory throughout and using a single thread per element so these were two areas that I could possibly improve, by either using shared memory or loading a number of pixels per thread (since some pixels will be zero and the threads with these pixels don’t really do anything).
The problem with using shared memory is that I’m unsure how to access the elements in different blocks, and I’m not really sure what the optimum number of elements to process in each thread would be.
If anyone has any advice that would be brilliant.
#ifndef LABELREGIONSKERNEL_CU
#define LABELREGIONSKERNEL_CU
#include <cutil_inline.h>
#include <math.h>
#include "LabelRegions.h"
#define min_valid(a, b) (a < b ? a == 0 ? b : a : b == 0 ? a : b)
__device__ int md;
__global__
void kernel_bc_initialise(int *pMarkers, int *pLut, int w, int h)
{
// Get the id for the element
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int id = x + y * w;
// if not in range then return
if (x > w-1 || y > h-1)
return;
// initislise the image and lookup table
pLut[id] = pMarkers[id] = pMarkers[id] == 0 ? 0 : id+1;
}
__global__
void kernel_bc_scan_image(int *pSrc, int *pLut, int w, int h)
{
// Get the id for the element
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int id = x + y * w;
int pn, ps, pe, pw, pc, min;
// if not in range then return
if (x > w-1 || y > h-1)
return;
// read the current element id
pc = pSrc[id];
// if the label is zero then return
if (pc == 0)
return;
// get the north, south, east and west pixel values
pn = (y > 0 ? pSrc[x + (y - 1) * w] : 0);
ps = (y < h - 1 ? pSrc[x + (y + 1) * w] : 0);
pw = (x > 0 ? pSrc[(x - 1) + y * w] : 0);
pe = (x < w - 1 ? pSrc[(x + 1) + y * w] : 0);
// find the minimum value of the four points
min = min_valid(pn, pw);
min = min_valid(min, pe);
min = min_valid(min, ps);
// if the minimum label is less than the current then set it
// in the lookup table
if (min < pc && min != 0) {
pLut[pc-1] = min < pLut[pc-1] ? min : pLut[pc-1];
md = 1;
}
}
__global__
void kernel_bc_analyse_lut(int *pSrc, int *pLut, int w, int h)
{
// Get the id for the element
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int id = x + y * w;
int label, ref;
// if not in range then return
if (x > w-1 || y > h-1)
return;
// get the label for the current element
label = pSrc[id];
// if the label is equal to the id
// then this thread takes ownership of
// the label
if (label == id+1) {
// loop through the lookup table to
// find the root label and set that as
// the label for the current element.
ref = label-1;
label = pLut[id];
while (ref != label-1) {
ref = label-1;
label = pLut[ref];
}
pLut[id] = label;
}
}
__global__
void kernel_bc_colour_blobs(int *pSrc, int *pLut, int w, int h)
{
// Get the id for the element
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int id = x + y * w;
int ref;
// if not in range then return
if (x > w-1 || y > h-1)
return;
// Set the value of the element using the lookup table
ref = pLut[pSrc[id]-1];
pSrc[id] = pSrc[id] == 0 ? 0 : pLut[ref-1];
}
__global__
void kernel_bc_pack_colours(int *pSrc, int *pLut, int w, int h)
{
// Get the id for the element
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int id = x + y * w;
int label;
// if not in range then return
if (x > w-1 || y > h-1)
return;
// set the lut value at this point to zero
pLut[id] = 0;
label = pSrc[id];
if (label == id+1) {
// set the lut value at
pLut[label] = label;
}
}
void gpu_label_regions(int *pMarkers, int w, int h, int *pBuffer)
{
int *pMd;
int md = 0;
// Allocate and copy variable
cutilSafeCall(cudaMalloc((void**) &pMd, sizeof(int)));
// copy symbol value
cudaMemcpyToSymbol("md", &md, sizeof(int), 0, cudaMemcpyHostToDevice);
// setup execution parameters
dim3 threads( 8, 8, 1);
dim3 grid( ceil(float(w)/threads.x), ceil(float(w)/threads.y), 1);
// initialise the markers and buffer
kernel_bc_initialise<<< grid, threads >>>(pMarkers, pBuffer, w, h);
// scan the image for the first time to see if we need to do anything
kernel_bc_scan_image<<< grid, threads >>>(pMarkers, pBuffer, w, h);
// get the value of md back from the device
cutilSafeCall(cudaMemcpyFromSymbol(&md, "md", sizeof(int), 0,
cudaMemcpyDeviceToHost));
// continue to loop through until no more changes need to be made
while (md != 0) {
// copy symbol value
cudaMemcpyToSymbol("md", &(md = 0), sizeof(int), 0,
cudaMemcpyHostToDevice);
// analyse the changes in the lookup table
kernel_bc_analyse_lut<<< grid, threads >>>(pMarkers, pBuffer, w, h);
// update the colour of the blobs
kernel_bc_colour_blobs<<< grid, threads >>>(pMarkers, pBuffer, w, h);
// scan the image to see if we need to do anything
kernel_bc_scan_image<<< grid, threads >>>(pMarkers, pBuffer, w, h);
// get the value of md back from the device
cutilSafeCall(cudaMemcpyFromSymbol(&md, "md", sizeof(int), 0,
cudaMemcpyDeviceToHost));
}
// free the device memory
cutilSafeCall(cudaFree(pMd));
}
int main()
{
int *pSrc;
int *pLut;
const int w = 9;
const int h = 9;
// allocate the memory for the host data
int pMarkers[w*h] = {
1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 0, 0,
1, 1, 0, 1, 1, 1, 1, 0, 0,
1, 1, 1, 0, 1, 1, 1, 1, 0,
1, 0, 0, 0, 0, 1, 1, 1, 1,
1, 0, 1, 1, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 0, 1, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1
};
int pLookup[w*h];
// Allocate cuda device memory
cutilSafeCall(cudaMalloc((void**) &pSrc, w*h*sizeof(int)));
cutilSafeCall(cudaMalloc((void**) &pLut, w*h*sizeof(int)));
// copy marker array across to the device
cutilSafeCall(cudaMemcpy(pSrc, pMarkers, w*h*sizeof(int), cudaMemcpyHostToDevice));
// label the regions
gpu_label_regions(pSrc, w, h, pLut);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// Copy the marker data back to the host
cutilSafeCall(cudaMemcpy(pMarkers, pSrc, w*h*sizeof(int), cudaMemcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(pLookup, pLut, w*h*sizeof(int), cudaMemcpyDeviceToHost));
// free the device memory
cutilSafeCall(cudaFree(pSrc));
cutilSafeCall(cudaFree(pLut));
// print
for (int j = 0; j < h; ++j) {
for (int i = 0; i < w; ++i) {
cout << pMarkers[i+j*w] << " ";
}
cout << endl;
}
system("pause");
return 0;
}
#endif