help with kernel launch (bilateral filter)

I have a GeForce 8400 GS card. I tried implementing the bilateral filter from the SDK using OpenCV, but the filter is not being applied: the program runs without crashing, but the kernel is not launched! Can someone check the code here?

#include <stdio.h>

#include <math.h>

// Shorthand for unsigned int. NOTE: this is a macro, not a typedef, so the
// functional-cast spelling uint(x) expands to `unsigned int(x)`, which is not
// valid standard C++ (some compilers accept it as an extension).
#define uint unsigned int

// 2D texture reference over the RGBA image. With cudaReadModeNormalizedFloat
// each 8-bit channel is returned to the kernel as a float in [0, 1].
texture<uchar4, 2, cudaReadModeNormalizedFloat> rgbaTex;

// 1D texture of float weights; the kernel samples it at (offset + radius) to
// build the spatial (domain) factor.
// NOTE(review): nothing in the visible code binds this texture - confirm it is
// set up elsewhere before the kernel is launched.
texture<float, 1, cudaReadModeElementType> gaussianTex;

// d_array:         source image, allocated and filled by initCUDA().
// d_tempArray:     intermediate image used between passes when iterations > 1.
// d_gaussianArray: NOTE(review): not allocated anywhere in the visible code;
//                  presumably the backing store for gaussianTex - confirm.
cudaArray* d_array, *d_tempArray, *d_gaussianArray;

// Linear device buffer of w*h packed pixels allocated by initCUDA().
// NOTE(review): no visible code reads or writes it - confirm it is still needed.
uint * d_img = NULL;

// Linear device buffer of w*h packed RGBA pixels that receives the kernel output.
uint *d_result = NULL;

// Range (photometric) weight of the bilateral filter.
//
// Returns exp(-|b - a|^2 / (2 * d^2)), where |b - a|^2 is the squared distance
// between the two pixels taken over all four channels. Evaluated with the fast
// __expf intrinsic. There is no guard against d == 0.
__device__ float euclideanLen(float4 a, float4 b, float d)
{
    const float dx = b.x - a.x;
    const float dy = b.y - a.y;
    const float dz = b.z - a.z;
    const float dw = b.w - a.w;

    // Squared 4-channel distance, summed in x, y, z, w order.
    const float distSq = dx * dx + dy * dy + dz * dz + dw * dw;

    return __expf(-distSq / (2 * d * d));
}

// Normalizes an accumulated colour and packs it into one 32-bit pixel.
//
// rgba : weighted colour sum (x = R, y = G, z = B, w = A).
// div  : normalization divisor (the sum of the weights).
//
// Each channel is divided by div, made non-negative, clamped to [0, 1] and
// scaled to 0..255. The result is laid out as 0xAABBGGRR (w in the top byte,
// x in the bottom byte).
__device__ uint rgbaFloatToInt(float4 rgba, float div)
{
    const float r = __saturatef(fabsf(rgba.x / div));   // clamp to [0.0, 1.0]
    const float g = __saturatef(fabsf(rgba.y / div));
    const float b = __saturatef(fabsf(rgba.z / div));
    const float a = __saturatef(fabsf(rgba.w / div));

    // Explicit casts instead of the functional form uint(...): `uint` is a
    // macro in this file, so uint(x) would expand to `unsigned int(x)`, which
    // standard C++ rejects.
    return (static_cast<unsigned int>(a * 255.0f) << 24) |
           (static_cast<unsigned int>(b * 255.0f) << 16) |
           (static_cast<unsigned int>(g * 255.0f) << 8)  |
            static_cast<unsigned int>(r * 255.0f);
}

// Bilateral filter kernel: one thread produces one output pixel.
//
// od  : output buffer of w*h packed RGBA pixels, row-major.
// e_d : range parameter forwarded to euclideanLen().
// w,h : image width and height in pixels.
// r   : filter radius; the window is (2r+1) x (2r+1).
//
// Launch layout: 2D grid of 2D blocks covering at least w x h threads; threads
// outside the image exit without writing.
// Reads rgbaTex (image) and gaussianTex (weights at indices 0..2r); both must
// be bound by the host before the launch.
__global__ void d_bilateral_filter(uint *od, float e_d, int w, int h, int r)
{
    // Signed coordinates on purpose: with unsigned x/y, "x + j" for negative j
    // wraps to ~4e9 at the left/top border instead of going below zero, so the
    // texture unit would be handed a coordinate far past the opposite edge.
    const int x = (int)(__umul24(blockIdx.x, blockDim.x) + threadIdx.x);
    const int y = (int)(__umul24(blockIdx.y, blockDim.y) + threadIdx.y);

    if (x >= w || y >= h)
        return;

    float sum = 0.0f;                                    // total weight
    float4 t = make_float4(0.0f, 0.0f, 0.0f, 0.0f);      // weighted colour sum
    const float4 center = tex2D(rgbaTex, (float)x, (float)y);

    for (int i = -r; i <= r; i++)
    {
        // The vertical weight does not depend on j, so fetch it once per row.
        const float rowWeight = tex1D(gaussianTex, (float)(i + r));

        for (int j = -r; j <= r; j++)
        {
            // Coordinates outside the image are left to the texture's
            // addressing mode.
            const float4 curPix = tex2D(rgbaTex, (float)(x + j), (float)(y + i));

            const float factor =
                (rowWeight * tex1D(gaussianTex, (float)(j + r))) *   // domain factor
                euclideanLen(curPix, center, e_d);                   // range factor

            t.x += curPix.x * factor;
            t.y += curPix.y * factor;
            t.z += curPix.z * factor;
            t.w += curPix.w * factor;

            sum += factor;
        }
    }

    od[y * w + x] = rgbaFloatToInt(t, sum);
}

// Prints one CUDA error to stderr, tagged with the caller's label.
static void reportCudaError(cudaError_t err, const char* label)
{
	fprintf(stderr, "\nCUDA Error: %s (at %s)\n", cudaGetErrorString(err), label);
}

// Waits for outstanding device work and reports any pending CUDA error.
//
// label : short description of the step just performed; echoed in the message.
//         Taken as const char* because callers pass string literals.
//
// Two checks are made: the result of the synchronization itself (surfaces
// faults raised while a kernel was executing) and cudaGetLastError() (surfaces
// errors recorded by earlier runtime calls or launches, and clears them).
// Errors are only reported; the function does not abort.
void checkErrors(const char* label)
{
	// cudaThreadSynchronize() is the legacy spelling of cudaDeviceSynchronize();
	// kept to match the toolkit generation the rest of this file targets.
	cudaError_t err = cudaThreadSynchronize();
	if (err != cudaSuccess)
	{
		reportCudaError(err, label);
	}

	err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		reportCudaError(err, label);
	}
}

extern "C"

{

	// Allocates the device buffers and uploads the source image.
	//
	// w, h   : image width and height in pixels (must be positive).
	// pImage : host pointer to w*h tightly packed 4-byte RGBA pixels.
	//
	// Allocates d_img and d_result (w*h pixels each) plus the two cudaArrays
	// d_array and d_tempArray, then copies pImage into d_array. Terminates the
	// process with a non-zero status if no CUDA device is available; other
	// failures are reported through checkErrors().
	void initCUDA(int w, int h, uchar4 *pImage)
	{
		// Initialized so a failing query cannot leave an indeterminate count.
		int deviceCount = 0;

		if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0)
		{
			printf("Sorry, no CUDA device found!!\nExiting the program...");

			// Non-zero status: this is a failure, not a normal exit.
			exit(1);
		}

		if (w <= 0 || h <= 0 || pImage == NULL)
		{
			fprintf(stderr, "\ninitCUDA: invalid image (w=%d, h=%d, pImage=%p)\n", w, h, (void*) pImage);
			return;
		}

		// Widen before multiplying so w*h cannot overflow int on large images.
		const size_t imageBytes = (size_t) w * (size_t) h * sizeof(unsigned int);

		cudaMalloc( (void**) &d_img, imageBytes );
		cudaMalloc( (void**) &d_result, imageBytes );
		checkErrors("memory allocation");

		// Four unsigned 8-bit channels per texel, matching uchar4.
		cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

		cudaMallocArray( &d_array, &channelDesc, w, h );
		cudaMallocArray( &d_tempArray, &channelDesc, w, h );

		cudaMemcpyToArray( d_array, 0, 0, pImage, imageBytes, cudaMemcpyHostToDevice );
		checkErrors("copy data to device");
	}

	// Releases every device allocation owned by this file.
	//
	// Frees the three cudaArrays and both linear buffers, then clears the
	// pointers so that a repeated call (or a call without a prior initCUDA)
	// only ever passes null to the free functions, which is a no-op.
	void stopCUDA(void)
	{
		cudaFreeArray(d_array);
		cudaFreeArray(d_tempArray);
		cudaFreeArray(d_gaussianArray);

		cudaFree(d_img);
		cudaFree(d_result);   // was leaked before: allocated in initCUDA but never freed

		d_array = NULL;
		d_tempArray = NULL;
		d_gaussianArray = NULL;
		d_img = NULL;
		d_result = NULL;
	}

	// Runs the bilateral filter on the image uploaded by initCUDA() and copies
	// the result back to the host.
	//
	// h_img      : host destination for width*height packed RGBA pixels.
	// width,
	// height     : image size in pixels; must match the values given to initCUDA().
	// e_d        : range parameter forwarded to the kernel.
	// radius     : filter radius in pixels.
	// iterations : number of filter passes; from the second pass on, the
	//              previous output is used as the input.
	// nthreads   : currently unused; the block size is fixed at 16x16.
	//
	// NOTE(review): the kernel also samples gaussianTex, which nothing in the
	// visible code binds - confirm it is set up before this function is called.
	void runCUDABilateral(uchar4 *h_img, int width, int height, float e_d, int radius, int iterations, int nthreads)
	{
		(void) nthreads;

		const int tile = 16;
		const size_t imageBytes = (size_t) width * (size_t) height * sizeof(unsigned int);

		// Launch configuration is loop-invariant. The Y extent must be derived
		// from height: using width for both axes skips the bottom rows of
		// portrait images and over-launches for landscape ones.
		const dim3 blockSize(tile, tile);
		const dim3 gridSize((width + tile - 1) / tile, (height + tile - 1) / tile);

		cudaBindTextureToArray(rgbaTex, d_array);

		for (int i = 0; i < iterations; i++)
		{
			d_bilateral_filter<<< gridSize, blockSize >>>( d_result, e_d, width, height, radius );
			checkErrors("kernel launch");

			if (iterations > 1)
			{
				// Feed this pass's output back in as the next pass's input.
				cudaMemcpyToArray( d_tempArray, 0, 0, d_result, imageBytes, cudaMemcpyDeviceToDevice );
				cudaBindTextureToArray(rgbaTex, d_tempArray);
			}
		}

		cudaMemcpy(h_img, d_result, imageBytes, cudaMemcpyDeviceToHost);
		checkErrors("copy data from device");
	}

}

The first three functions are taken exactly from the SDK. I call runCUDABilateral() from the host, passing the image parameters; initCUDA() is called first.

The last two errors, “kernel launch” and “copy data from device”, are showing up; there are no problems with “memory allocation” and “copy data to device”.

Can someone tell me why this is happening?

Thanks :)

I am new to this forum. Did I make any mistake in etiquette? :|

Code is much nicer to read in this forum if you enclose it in [code]…[/code] tags. Maybe you get more people to look at it if you edit your post accordingly.

Having said that, on a (very) quick glance I don’t see anything wrong with the code. Maybe the problem is elsewhere. Are you compiling for compute capability 1.0 (sm_10)? Can you compile and successfully run examples from the SDK?

The SDK code runs perfectly in VS2008. I tried to debug this code using the Nsight tool, but the Nsight debugger itself crashes and makes Windows unusable.