hey guys,
i need some help with this.
im trying to get this filter kernel running, but it is not working…
[codebox]
#ifndef FILTER_KERNEL_H
#define FILTER_KERNEL_H
#define TILE_W 16
#define TILE_H 16
#define R 1 // Filter radius
#define D (R*2 + 1) // Filter diameter
#define S (D*D) // Filter size
#define BLOCK_W (TILE_W+(2*R))
#define BLOCK_H (TILE_H+(2*R))
global void filter(float *odata, float *idata, int width, int height)
{
__shared__ float smem[BLOCK_W*BLOCK_H];
int x = blockIdx.x*TILE_W + threadIdx.x - R;
int y = blockIdx.y*TILE_H + threadIdx.y - R;
//clamp to edge of image
x = max(0,x);
x = min(x,width-1);
y = max(y,0);
y = min(y,height-1);
unsigned int index = y*width + x;
unsigned int bindex = threadIdx.y * blockDim.y + threadIdx.x;
//each thread copies its pixel of the block to shared memory
smem[bindex] = idata[index];
__syncthreads();
//only threads inside apron will write results
if ( (threadIdx.x >= R) && (threadIdx.x < BLOCK_W-R) && (threadIdx.y >= R) && (threadIdx.y < BLOCK_W-R) )
{
float sum=0;
for (int dy=-R; dy<=R; dy++)
{
for (int dx=-R; dx<=R; dx++)
{
float i = smem[bindex + (dy*blockDim.x) + dx];
sum += i;
}
}
odata[index]=sum/S;
}
}
#endif // FILTER_KERNEL_H
[/codebox]
my questions:
whats my block dimension? Is it supposed to be (BLOCK_W,BLOCK_H)?
whats my grid dimension?
lets assume my input picture is the size 640x480, whats the size of my output picture? 638x478 cause of the apron (R 1)?
what i want to do is, take a 640x480 grayscale picture add the apron ( picture is now the size 642x482 ) filter it with this kernel
and have an output picture, which is 640x480 again.
any help or explanation is greatly appreciated.
thanks
Flo
here’s the rest of my code.
[codebox]// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <filter_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
runTest( argc, argv);
cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
// size of the matrix
const unsigned int size_x = 72;
const unsigned int size_y = 36;
// size of memory required to store the matrix
const unsigned int mem_size = sizeof(float) * size_x * size_y;
unsigned int timer;
cutCreateTimer(&timer);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// allocate host memory
float* h_idata = (float*) malloc(mem_size);
// initalize the memory
for( unsigned int i = 0; i < (size_x * size_y); ++i)
{
h_idata[i] = 9.; // rand();
}
//printing input matrix
for (int j=0; j<((size_x)*(size_y)); j++)
{
if (j%(size_x)==0) printf("\n");
printf(" %d ", (int) h_idata[j]);
}
printf("\n");
// allocate device memory
float* d_idata;
float* d_odata;
cutilSafeCall( cudaMalloc( (void**) &d_idata, mem_size));
cutilSafeCall( cudaMalloc( (void**) &d_odata, mem_size));
// copy host memory to device
cutilSafeCall( cudaMemcpy( d_idata, h_idata, mem_size,
cudaMemcpyHostToDevice) );
// setup execution parameters
dim3 grid(size_x / BLOCK_W, size_y / BLOCK_H, 1);
dim3 threads(BLOCK_W, BLOCK_H, 1);
// warmup so we don’t time CUDA startup
filter<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
// synchronize here, so we make sure that we don't count any time from the asynchronize kernel launches.
cudaThreadSynchronize();
cutStartTimer(timer);
filter<<< grid, threads >>>(d_odata, d_idata, size_x, size_y);
cudaThreadSynchronize();
cutStopTimer(timer);
float averageTime = cutGetTimerValue(timer);
printf(“average time: %0.3f ms\n\n”, averageTime);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
// copy result from device to host
float* h_odata = (float*) malloc(mem_size);
cutilSafeCall( cudaMemcpy( h_odata, d_odata, mem_size,
cudaMemcpyDeviceToHost) );
//printing output matrix
for (int j=0; j<((size_x-2)*(size_y-2)); j++)
{
if (j%(size_x-2)==0) printf("\n");
printf(" %0.2f ", h_odata[j]);
}
// cleanup memory
free(h_idata);
free(h_odata);
cutilSafeCall(cudaFree(d_idata));
cutilSafeCall(cudaFree(d_odata));
cutilCheckError( cutDeleteTimer(timer));
cudaThreadExit();
}
[/codebox]