Hello,
I am developing my first big project in CUDA “Parallel morphological operations on images”. Everything was OK since i started running a cuda kernel for big picture (1600x1200 pixels) for a morphologic mask 125x125 elements.
I have 9600 GT video card
For 512 threads in 512 blocks my computing stopped, graphic card was reset and there were no changes in image. After three days I realized that there was a problem with too long computing of one thread. I increased number of blocks to 3750 and everything should goes fine but it isn’t operation steel freezes. I started to change range of pixels on which are morphologic operation is made ( for one pixel there are 125x125 compare operations), if i change range to 1200x1200 everything goes OK, so i tried to start kernel two times for a half of image with 3750 blocks and then error occurs again ! I don’t know what to do and days of debugging gives me nothing (algorithm is ok i ran it on CUDA emulator). I can also say that my morphologic algorithm works fine for image 800x600 with mask 125x125. Below is my source code:
My kernel end get_pixel device method:
[codebox]
/************************************************************
*******************************/
device char * get_pixel(char * image, int i, int j, int image_width, int image_height)
{
if (i < 0 || j < 0 || i >= image_height || j >= image_width) {
return NULL;
};
return image + (i*image_width + j);
}
/************************************************************
*******************************/
device char get_pixel_value(char * image, int i, int j, int image_width, int image_height)
{
char * pixel = get_pixel(image, i, j, image_width, image_height);
if (pixel == NULL) {
return GRAY;
};
return *pixel;
}
/************************************************************
*******************************/
// Kernel that executes on the CUDA device
global void g_erode(char * image, char * out_image, int idx_start, int idx_end, int image_width, int image_height, int struct_element_size, int * change)
{
//Wyliczenie ilosci watkow
int threadNumb = BLOCK_SIZE * BLOCK_NUMBER;
change[0] = threadNumb;
int threadId = blockIdx.x * blockDim.x + threadIdx.x;
//int pixels = image_width*image_height;
int pixels = idx_end - idx_start;
int pixelStringSize = pixels/threadNumb;
int pixelMod = pixels % threadNumb;
int idx = 0;
int i = 0;
int j = 0;
int c, d;
int add = 0;
bool bBreak = false;
int diff = (int)(SIZE/2);
if (threadId <= pixels) {
if (threadId < pixelMod) {
idx = idx_start + threadId*(pixelStringSize + 1);
add = 1;
} else {
idx = idx_start + pixelMod*(pixelStringSize +1) + (threadId - pixelMod)*pixelStringSize;
};
for (int k = idx; k < idx + pixelStringSize + add;k++) {
i = (int) k / image_width;
j = k % image_width;
if (get_pixel(image, i, j, image_width, image_height) != NULL) {
for (int x = 0; x < SIZE; x++) {
for (int y = 0; y < SIZE; y++) {
c = i+x-diff;
d = j+y-diff;
if (get_pixel(image, c, d, image_width, image_height) != NULL) {
if (get_pixel_value(image, c, d, image_width, image_height) != BLACK) {
*get_pixel(out_image, i, j, image_width, image_height) = WHITE;
bBreak = true;
break;
};
};
};
};
};
};
};
}
[/codebox]
And the kernel call:
[codebox]
//src is OpenCV object with char* imageData where image is beeing kept
const int IMAGE_WIDTH = src->width;
const int IMAGE_HEIGHT = src->height;
size_t mem_size_image = IMAGE_WIDTH * IMAGE_HEIGHT * sizeof(char);
// Pointer to host & device arrays (src & dst)
char * d_src, * d_dst;
//Allocate memory
cudaMalloc((void **) &d_src, mem_size_image);
cudaMalloc((void **) &d_dst, mem_size_image);
// Copy image from host to device
cudaMemcpy(d_src, src->imageData, mem_size_image, cudaMemcpyHostToDevice);
// Initialize block size and no of blocks
int block_size = BLOCK_SIZE;
int n_blocks = BLOCK_NUMBER;
int ch_size = SIZE*SIZE;
//debug pointer
int * change = (int )malloc(ch_sizesizeof(int));
cudaMalloc((void **) &d_change, ch_size*sizeof(int));
cudaMemcpy(d_change, change, ch_size*sizeof(int), cudaMemcpyHostToDevice);
//Run erosion for first half of picture
g_erode <<< n_blocks, block_size >>> (d_src, d_dst, 0, 960000, IMAGE_WIDTH, IMAGE_HEIGHT, mask[i], d_change);
//Run erosion for second one
g_erode <<< n_blocks, block_size >>> (d_src, d_dst, 960000, 1920000, IMAGE_WIDTH, IMAGE_HEIGHT, mask[i], d_change);
//Copy mem from device to host
cudaMemcpy((*dst)->imageData, d_dst, mem_size_image, cudaMemcpyDeviceToHost);
[/codebox]
As i said when I perform this call the monitor blinks (or even system freezes) and I get ‘The launch timed out and was terminated.’ error. When i comment second line of g_erode everything goes good but erosion goeos only for half of picture.
If anyone could help I will be very thankful.