NPP TEXTURE BIND ERROR NppiFilter_8u_C1R error

Hi,

I am trying to filter an image with nppiFilter_8u_C1R, but I am having trouble: when nppiFilter_8u_C1R is called, the returned NppStatus is -24 (NPP_TEXTURE_BIND_ERROR). I cannot figure out what is wrong with my code; this is my snippet:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <nppdefs.h>
#include <nppcore.h>
#include <npp.h>
#include <nppi.h>
#include <npps.h>

#include <stdio.h>
#include <stdlib.h>

#include <iostream>

using namespace std;

#define LENGTH 10

#define KERNEL 5

void main(char *str[])

{

	Npp8u 

		*hImage = (Npp8u *)malloc(LENGTH * LENGTH * sizeof(Npp8u)),

		*hOutput = (Npp8u *)malloc(LENGTH * LENGTH * sizeof(Npp8u));

	Npp8u

		*dImage,

		*dOutput;

	Npp32s

		*hKernel = (Npp32s *)malloc(KERNEL * KERNEL * sizeof(Npp32s)),

		*dKernel;

	size_t

		pImage,

		pOutput;

	NppiSize

		sizeImage,

		sizeKernel;

	

	sizeImage.height =

		LENGTH;

	sizeImage.width =

		LENGTH;

	sizeKernel.height =

		KERNEL;

	sizeKernel.width =

		KERNEL;

	for( int i = 0; i < LENGTH; i++ )

		for( int j = 0; j < LENGTH; j++)

			hImage[i * LENGTH + j] = i;

	for( int i = 0 ; i < KERNEL; i++)

		for( int j = 0; j < LENGTH; j++)

			hKernel[i * KERNEL + j] = 2;

	cudaMalloc<Npp32s>((Npp32s **)&dKernel, KERNEL * KERNEL * sizeof(Npp32s));

	cudaMallocPitch<Npp8u>((Npp8u **)&dImage, &pImage, LENGTH, LENGTH);

	cudaMallocPitch<Npp8u>((Npp8u **)&dOutput, &pOutput, LENGTH, LENGTH);

	cudaMemcpy2D( dKernel, KERNEL * sizeof(Npp32s), hKernel, KERNEL * sizeof(Npp32s), KERNEL, KERNEL, cudaMemcpyHostToDevice );

	cudaMemcpy2D( dImage, pImage, hImage, LENGTH * sizeof(Npp8u), LENGTH, LENGTH, cudaMemcpyHostToDevice );

	NppiPoint k;

	k.x = 

		0;

	k.y = 

		0;

	NppStatus p =

		nppiFilter_8u_C1R(dImage, pImage, dOutput, pOutput, sizeImage, dKernel, sizeKernel, k, 1);

	cout << "NppSatus: " << p << "\n";

	cudaMemcpy2D( hOutput, LENGTH * sizeof(Npp8u), dOutput, pOutput, LENGTH, LENGTH, cudaMemcpyDeviceToHost );

	cout << "Kernel" << "\n";

	for( int i = 0; i < KERNEL; i++ )

	{

		for( int j = 0; j < KERNEL; j++)

		{

			cout << (int)hKernel[i * KERNEL + j] << " ";

		}

		cout << "\n";

	}

	cout << "Image" << "\n";

	for( int i = 0; i < LENGTH; i++ )

	{

		for( int j = 0; j < LENGTH; j++)

		{

			cout << (int)hImage[i * LENGTH + j] << " ";

		}

		cout << "\n";

	}

	cudaMemcpy2D( hKernel, KERNEL * sizeof(Npp32s), dKernel, KERNEL * sizeof(Npp32s), KERNEL, KERNEL, cudaMemcpyDeviceToHost );

	cout << "\nKernel" << "\n";

	for( int i = 0; i < KERNEL; i++ )

	{

		for( int j = 0; j < KERNEL; j++)

		{

			cout << (int)hKernel[i * KERNEL + j] << " ";

		}

		cout << "\n";

	}

	cudaMemcpy2D( hImage, LENGTH * sizeof(Npp8u), dImage, pImage, LENGTH, LENGTH, cudaMemcpyDeviceToHost );

	cout << "Image" << "\n";

	for( int i = 0; i < LENGTH; i++ )

	{

		for( int j = 0; j < LENGTH; j++)

		{

			cout << (int)hImage[i * LENGTH + j] << " ";

		}

		cout << "\n";

	}

	cout << "Output" << "\n";

	for( int i = 0; i < LENGTH; i++ )

	{

		for( int j = 0; j < LENGTH; j++)

		{

			cout << (int)hOutput[i * LENGTH + j] << " ";

		}

		cout << "\n";

	}

	getchar();

}

Please help me

Nobody??

Should I abandon the NPP library and write a custom kernel instead? Please help me.