I have a GeForce 8400Gs card… i tried implementing bilateral filter from the sdk using opencv. But the filter is not getting applied… the program runs smoothly but the kernel is not launched!! can some1 check the code here…
#include <stdio.h>
#include <math.h>
#define uint unsigned int
texture<uchar4, 2, cudaReadModeNormalizedFloat> rgbaTex;
texture<float, 1, cudaReadModeElementType> gaussianTex;
cudaArray* d_array, *d_tempArray, *d_gaussianArray;
uint * d_img = NULL;
uint *d_result = NULL;
__device__ float euclideanLen(float4 a, float4 b, float d)
{
float mod = (b.x - a.x) * (b.x - a.x) +
(b.y - a.y) * (b.y - a.y) +
(b.z - a.z) * (b.z - a.z) +
(b.w - a.w) * (b.w - a.w);
return __expf(-mod / (2 * d * d));
}
__device__ uint rgbaFloatToInt(float4 rgba , float div)
{
rgba.x = __saturatef(fabs(rgba.x/div)); // clamp to [0.0, 1.0]
rgba.y = __saturatef(fabs(rgba.y/div));
rgba.z = __saturatef(fabs(rgba.z/div));
rgba.w = __saturatef(fabs(rgba.w/div));
return (uint(rgba.w * 255.0f) << 24) | (uint(rgba.z * 255.0f) << 16) | (uint(rgba.y * 255.0f) << 8) | uint(rgba.x * 255.0f);
}
__global__ void d_bilateral_filter(uint *od, float e_d, int w, int h, int r)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
if (x < w && y < h)
{
float sum = 0.0f;
float factor;
float4 t = make_float4( 0.0f, 0.0f, 0.0f, 0.0f);
float4 center = tex2D(rgbaTex, x, y);
for(int i = -r; i <= r; i++)
{
for(int j = -r; j <= r; j++)
{
float4 curPix = tex2D(rgbaTex, x + j, y + i);
factor = (tex1D(gaussianTex, i + r) * tex1D(gaussianTex, j + r)) * //domain factor
euclideanLen(curPix, center, e_d); //range factor
t.x += curPix.x * factor;
t.y += curPix.y * factor;
t.z += curPix.z * factor;
t.w += curPix.w * factor;
sum += factor;
}
}
od[y * w + x] = rgbaFloatToInt( t , sum);
}
}
void checkErrors(char* label)
{
cudaError_t err;
err = cudaThreadSynchronize();
if (err != cudaSuccess)
{
char* e = (char*) cudaGetErrorString(err);
fprintf(stderr, "\nCUDA Error: %s (at %s)\n", e, label);
}
err = cudaGetLastError();
if (err != cudaSuccess)
{
char* e = (char*) cudaGetErrorString(err);
fprintf(stderr, "\nCUDA Error: %s (at %s)\n", e, label);
}
}
extern "C"
{
void initCUDA(int w, int h, uchar4 *pImage)
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (deviceCount == 0)
{
printf("Sorry, no CUDA device found!!\nExiting the program...");
exit(0);
}
cudaMalloc( (void**) &d_img, ( w * h * sizeof(unsigned int)) );
cudaMalloc( (void **)&d_result, ( w * h *sizeof(unsigned int)) );
checkErrors("memory allocation");
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
cudaMallocArray ( &d_array, &channelDesc, w, h );
cudaMallocArray ( &d_tempArray, &channelDesc, w, h );
cudaMemcpyToArray( d_array, 0, 0, pImage, (w * h * sizeof(unsigned int)), cudaMemcpyHostToDevice);
checkErrors("copy data to device");
}
void stopCUDA(void)
{
cudaFreeArray(d_array);
cudaFreeArray(d_tempArray);
cudaFreeArray(d_gaussianArray);
cudaFree(d_img);
}
void runCUDABilateral(uchar4 *h_img, int width, int height, float e_d, int radius, int iterations, int nthreads)
{
cudaBindTextureToArray(rgbaTex, d_array);
for(int i=0; i<iterations; i++)
{
dim3 gridSize((width + 16 - 1) / 16, (width + 16 - 1) / 16);
dim3 blockSize(16, 16);
d_bilateral_filter<<< gridSize, blockSize>>>( d_result, e_d, width, height, radius );
checkErrors("kernel launch");
if (iterations > 1)
{
cudaMemcpyToArray( d_tempArray, 0, 0, d_result, width * height * sizeof(float), cudaMemcpyDeviceToDevice);
cudaBindTextureToArray(rgbaTex, d_tempArray);
}
}
cudaMemcpy(h_img, d_result, width*height*sizeof(unsigned int), cudaMemcpyDeviceToHost);
checkErrors("copy data from device");
}
}
the first 3 functions are exactly frm sdk… I call runCUDABilateral() from host passing image param… initCUDA() is called first…
the last 2 errors “kernel launch” & “copy data FROM device” are showing up… no probs with “memory alloc” n “copy data TO device”…
can some1 tell me why is this happening?
thanx :)