CUDA SDK boxFilter example: how to use the boxFilter functions?

Hi all,

I want to test the boxfilter sample in the CUDA 3.1 SDK.

I need help with the following:
1 - I tested the program with the LenaRGB image and the results are OK. But when I used my own image, the filtered result appears to have some extra rows at the bottom of the image (please see the attached images). Where is the problem?

2 - I want to test the function boxFilter (not boxFilterRGBA) with the following code:

// Standalone test of boxFilter() on a small synthetic float image.
// width, height, filter_radius, iterations and nthreads are not declared in
// this snippet; they are expected to come from the surrounding sample code.
width = 10;
height = 10;
const size_t img_bytes = width * height * sizeof(float);

float *d_img_float = NULL;
float *d_temp_float = NULL;
float *d_result_float = NULL;

// Host buffers, each declared exactly once: the synthetic input image and
// the buffer that receives the filtered result.
float *h_img_float = (float *)malloc(img_bytes);
float *h_dest = (float *)malloc(img_bytes);
if (h_img_float == NULL || h_dest == NULL)
{
    fprintf(stderr, "host allocation failed\n");
    exit(EXIT_FAILURE);
}

// Fill the input so that every pixel in row y holds y * 0.1, and print it.
for (int y = 0; y < height; y++)
{
    for (int x = 0; x < width; x++)
    {
        h_img_float[y * width + x] = y * 0.1f;
        printf("%f\t", h_img_float[y * width + x]);
    }
    printf("\n");
}

cutilSafeCall(cudaMalloc((void **)&d_img_float, img_bytes));
cutilSafeCall(cudaMalloc((void **)&d_temp_float, img_bytes));
cutilSafeCall(cudaMalloc((void **)&d_result_float, img_bytes));

// Upload the input image so the buffer handed to boxFilter() has defined
// contents instead of whatever cudaMalloc returned.
cutilSafeCall(cudaMemcpy(d_img_float, h_img_float, img_bytes, cudaMemcpyHostToDevice));

// NOTE(review): initTexture() is defined elsewhere. Confirm that the array it
// creates uses a float channel format; if it was written for 8-bit RGBA
// pixels, float data passed here will be misinterpreted.
initTexture(width, height, h_img_float);

// NOTE(review): boxFilter() is defined elsewhere. If it sizes its launches as
// height / nthreads and width / nthreads (integer division), a 10x10 image
// with nthreads > 10 launches zero blocks and d_result_float is never
// written. Verify, and use a ceil-division grid with an in-kernel bounds
// check if so.
double d = boxFilter(d_img_float, d_temp_float, d_result_float, width, height, filter_radius, iterations, nthreads);
cutilSafeCall(cudaMemcpy(h_dest, d_result_float, img_bytes, cudaMemcpyDeviceToHost));

printf("d = %f\n", d);

// Print the filtered image row by row.
for (int y = 0; y < height; y++)
{
    for (int x = 0; x < width; x++)
    {
        float val = h_dest[y * width + x];
        printf("%f\t", val);
    }
    printf("\n");
}

// Release everything this snippet allocated (cleanup() belongs to the
// surrounding sample and is called exactly as before).
cleanup();
cutilSafeCall(cudaFree(d_result_float));
cutilSafeCall(cudaFree(d_temp_float));
cutilSafeCall(cudaFree(d_img_float));
free(h_dest);
free(h_img_float);

The printed values of h_dest are very strange. Again, where is the problem?

I am very sorry for the long message, but I really need fast help.
thanks.
Ghada.

Hi, this is my code.
I think it can help you!
/**
 * Applies an NPP box filter (replicated border) to a single-channel 8-bit image.
 *
 * The image is uploaded to a pitched device buffer, filtered with
 * nppiFilterBoxBorder_8u_C1R over the full width x height region, and the
 * result is downloaded again. The wall-clock time of each stage is printed.
 *
 * @param srcCuda   HOST pointer to the input image (it is the source of a
 *                  host-to-device copy); rows are tightly packed, i.e. the
 *                  row pitch is `width` bytes.
 * @param dstCuda   HOST pointer receiving the filtered image (destination of
 *                  a device-to-host copy); same layout as srcCuda.
 * @param width     Image width in pixels.
 * @param height    Image height in pixels.
 * @param KerSizeX  Box-filter mask width.
 * @param KerSizeY  Box-filter mask height.
 *
 * @throws NppStatus when the filter call does not return NPP_SUCCESS.
 *         NPP_ASSERT_NOT_NULL / NPP_CHECK_CUDA are defined elsewhere and
 *         presumably raise on failure as well -- confirm against their
 *         definitions.
 */
void NppiBoxFilter8uC1R(const unsigned char* srcCuda, unsigned char* dstCuda, int width, int height, int KerSizeX, int KerSizeY)
{
    // Owns an nppiMalloc'd image and frees it on every exit path, so a thrown
    // status no longer leaks device memory.
    struct DeviceImage
    {
        Npp8u* ptr;
        explicit DeviceImage(Npp8u* p) : ptr(p) {}
        ~DeviceImage() { if (ptr) nppiFree(ptr); }
    };

    // --- Stage 1: allocate the device source image and upload the input ---
    // (the measured time covers the allocation as well as the copy)
    int nSrcPitch = 0;
    clock_t start = clock();
    Npp8u* pSrcImage = nppiMalloc_8u_C1(width, height, &nSrcPitch);
    DeviceImage srcGuard(pSrcImage);
    NPP_ASSERT_NOT_NULL(pSrcImage);
    // Host rows are `width` bytes apart, device rows are nSrcPitch bytes apart.
    NPP_CHECK_CUDA(cudaMemcpy2D(pSrcImage, nSrcPitch, srcCuda, width, width, height, cudaMemcpyHostToDevice));
    clock_t end = clock();
    double elapsed_time = (end - start) / (double)CLOCKS_PER_SEC;

    printf("cudaMemcpy2D %f seconds\n", elapsed_time);

    // --- Device destination image ---
    // No upload of dstCuda is needed: the filter writes every pixel of the
    // ROI, and the ROI below covers the whole image.
    int nDstPitch = 0;
    Npp8u* pDstImage = nppiMalloc_8u_C1(width, height, &nDstPitch);
    DeviceImage dstGuard(pDstImage);
    NPP_ASSERT_NOT_NULL(pDstImage);

    // Box-filter mask size.
    NppiSize oMaskSize = { KerSizeX, KerSizeY };

    // Full source extent; the ROI starts at its origin.
    NppiSize oSrcSize = { width, height };
    NppiPoint oSrcOffset = { 0, 0 };

    // Region of interest: the whole image.
    NppiSize oSizeROI = { width, height };

    // Anchor at the mask centre; integer division rounds down for odd sizes.
    NppiPoint oAnchor = { oMaskSize.width / 2, oMaskSize.height / 2 };

    // --- Stage 2: run the filter ---
    start = clock();
    NppStatus status = nppiFilterBoxBorder_8u_C1R(pSrcImage, nSrcPitch,
        oSrcSize, oSrcOffset, pDstImage, nDstPitch,
        oSizeROI, oMaskSize, oAnchor, NPP_BORDER_REPLICATE);
    // Wait for the device so the measured time covers the filter execution,
    // and surface any asynchronous CUDA error.
    NPP_CHECK_CUDA(cudaDeviceSynchronize());
    end = clock();
    elapsed_time = (end - start) / (double)CLOCKS_PER_SEC;

    printf("nppiFilterBoxBorder_8u_C1R %f seconds\n", elapsed_time);

    if (status != NPP_SUCCESS)
        throw(status); // the DeviceImage guards release both device images

    // --- Stage 3: download the filtered image into the caller's buffer ---
    start = clock();
    NPP_CHECK_CUDA(cudaMemcpy2D(dstCuda, width, pDstImage, nDstPitch, width, height, cudaMemcpyDeviceToHost));
    end = clock();
    elapsed_time = (end - start) / (double)CLOCKS_PER_SEC;

    printf("cudaMemcpy2D %f seconds\n", elapsed_time);

    // Device images are freed by the DeviceImage guards on return.
}