Kernel Works in Emulation Mode But Not In Release Mode

Hi,

I’m developing a 1D median filter using CUDA. I’m a reasonably experienced programmer however this is my first attempt at coding using CUDA.

The problem I’m having is that the kernel I’ve developed works properly when compiled in emulation mode, however I’m not getting the correct output when I use the GPU version of the kernel.

I’ve attached the input image and the output image.

I’m using a struct containing four unsigned chars to represent a Pixel.

Here’s the code I’m using:

// dataTypes.h

#pragma once

typedef unsigned char dataType;

typedef struct tag_pixel {

	dataType R;

	dataType G;

	dataType B;

	dataType A;

} Pixel;

// naive1DGPUMedianFilter.cu

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include "naive1DGPUMedianFilter.h"

#include "utilities.h"

// Filter the data.

// 1. Get a pixel.

// 2. Get the surrounding pixels (using the radius value).

// 3. Sort the pixels and get the median value.

// Note that for .tif files, THE ORIGIN OF THE RASTER IS THE

// LOWER LEFT CORNER!

//============================================================

=============

#define THREADS_PER_BLOCK 100 

#define NUMBER_OF_THREADS 1000000

//============================================================

=============

// Device funtion prototypes.

__device__ dataType sortArray(dataType* array, unsigned int arraySize, unsigned int radius); 

//============================================================

=============

// Filter kernel.

__global__ void naive1DFilterDevice(Pixel* iData, 

									Pixel* oData, 

									dataType* medianArrayR, 

									dataType* medianArrayG, 

									dataType* medianArrayB, 

									dataType* medianArrayA, 

									unsigned int w, 

									unsigned int h, 

									unsigned r) {

	

	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	int dataSize = w * h; 

	int i;

	int j;

	// Setup the median array.

	if(idx >= r && idx <= (dataSize - r)) {							// Centre section of image.

		for(i = idx - r, j = 0; i <= idx + r; i++, j++) {

			medianArrayR[j] = iData[i].R;	

			medianArrayG[j] = iData[i].G;	

			medianArrayB[j] = iData[i].B;	

			medianArrayA[j] = iData[i].A;

//			printf("R Values: %d ", medianArrayR[j]);

//			printf("G Values: %d ", medianArrayG[j]);

//			printf("B Values: %d ", medianArrayB[j]);

//			printf("A Values: %d ", medianArrayA[j]);

		}

		// Sort the median array and return the median value.

		oData[idx].R = sortArray(medianArrayR, (2 * r) + 1, r);

		oData[idx].G = sortArray(medianArrayG, (2 * r) + 1, r);

		oData[idx].B = sortArray(medianArrayB, (2 * r) + 1, r);

		oData[idx].A = sortArray(medianArrayA, (2 * r) + 1, r);

//		printf("Median: %d\n", oData[idx].R);	

//		printf("Median: %d\n", oData[idx].G);	

//		printf("Median: %d\n", oData[idx].B);	

//		printf("Median: %d\n", oData[idx].A);	

	}

}

__device__ dataType sortArray(dataType* array, unsigned int arraySize, unsigned int radius) {

/*

	printf("In sortArray...\n");

	printf("Before sort: ");

	int k;

	for(k = 0; k < arraySize; k++)

		printf("%d ", array[k]);

	printf("\n");

*/	

	int i, j;

	dataType index; 

	

	for (i = 1; i < arraySize; i++) { 

		index = array[i]; 

		j = i; 

		while ((j > 0) && (array[j - 1] > index)) { 

			array[j] = array[j - 1]; 

			j = j - 1; 

		} 

		array[j] = index; 

	} 

/*	

	printf("After sort: ");

	for(k = 0; k < arraySize; k++)

		printf("%d ", array[k]);

	printf("\n");

	printf("Median Value: %d\n", array[radius]);

*/

	return array[radius];

}

//============================================================

=============

// Wrapper for kernel filter algorithm.

void naive1DGPUMedianFilter(Pixel* inputData, Pixel* outputData, unsigned int width, unsigned int height, unsigned int radius) {

	printf("Using naive1DGPUMedianFilter...\n");

	Pixel* inputMemD;

	Pixel* outputMemD;

	dataType* medianArrayRMemD;

	dataType* medianArrayGMemD;

	dataType* medianArrayBMemD;

	dataType* medianArrayAMemD;

	int dataSize = width * height;

	int medianArraySize = (2 * radius) + 1;

	int N = NUMBER_OF_THREADS;

	int threadsPerBlock = THREADS_PER_BLOCK;

	int nBlocks = N / threadsPerBlock + (N % threadsPerBlock == 0 ? 0 : 1);

	// Allocate device memory.

	cudaMalloc((void**) &inputMemD, sizeof(Pixel) * dataSize);

	cudaMalloc((void**) &outputMemD, sizeof(Pixel) * dataSize);

	cudaMalloc((void**) &medianArrayRMemD, sizeof(dataType) * medianArraySize);

	cudaMalloc((void**) &medianArrayGMemD, sizeof(dataType) * medianArraySize);

	cudaMalloc((void**) &medianArrayBMemD, sizeof(dataType) * medianArraySize);

	cudaMalloc((void**) &medianArrayAMemD, sizeof(dataType) * medianArraySize);

	

	// Copy data to device.

	cudaMemcpy(inputMemD, inputData, sizeof(Pixel) * dataSize, cudaMemcpyHostToDevice);

	

	// Call filter kernel.	

	naive1DFilterDevice<<<nBlocks, threadsPerBlock>>>(inputMemD, outputMemD, 

													  medianArrayRMemD, medianArrayGMemD, medianArrayBMemD, medianArrayAMemD,

													  width, height, radius);

	// Copy data to host.

	cudaMemcpy(outputData, outputMemD, sizeof(Pixel) * dataSize, cudaMemcpyDeviceToHost);

/*

	int i;

	for(i = 0; i < dataSize; i++) {

		printf("%d\t", outputData[i].R);

		printf("%d\t", outputData[i].G);

		printf("%d\t", outputData[i].B);

		printf("%d\n", outputData[i].A);

	}

*/

	cudaFree(inputMemD);

	cudaFree(outputMemD);

	cudaFree(medianArrayRMemD);

	cudaFree(medianArrayGMemD);

	cudaFree(medianArrayBMemD);

	cudaFree(medianArrayAMemD);

}

Am I missing soumething obvious here?

Cheers,

Chris
Output.tif (505 KB)
StarryNight.tif (379 KB)

Start by checking to see if all calls are returning what you expect–it’s entirely possible you’re accessing out of bounds memory on the GPU or something.

What would be the best way to do this? Use the CUDA Debugger?

All cuda* calls return a cudaError_t…

Ah…thanks!

I figured out what was going wrong.

I was only allocating one median array for the whole computation, rather than one median array per thread - doh!

Cheers,

Chris