Hi,
I am trying to use the NPPI library for some image processing functions. For someone who has worked with Delphi/Pascal for 35 years, this is not a very simple task.
I managed to get the transpose and the multiplication functions working, but then I stumbled on ‘TemplateMatching’ as it is named in OpenCV. And I need a lot of template matching for this application, so it is really important for me.
I got the following code snippet (sorry not very sophisticated, remember I have a Pascal background) for creating a .dll function:
extern “C” __declspec(dllexport) void matchTemplate( int * b_out, unsigned char * imin,int inw, int inh,
unsigned char * tpl,int tplw,int tplh, unsigned char * resari, float* resarf, unsigned char * retim)
// b_out is a vehicle for sending some results back to the calling program, resari is the resulting uint8 bitmap, rearf the // resulting float array and retim is just to check whether the image arrived correctly in the GPU memory.
{
unsigned char * d_devbuf;
unsigned char * d_imin;
unsigned char * d_tpl;
unsigned char * d_resari;
unsigned char * d_retim;
float * d_resarf;
// all width and height parameters have been checked
// inw = 200
// inh = 150
// tpw = 40
// tph = 30
// resw = 161
// resh = 121
int resw = inw - tplw + 1;
int resh = inh - tplh + 1;
NppiSize roiin = {inw,inh};
NppiSize roitpl = {tplw,tplh};
err = cudaMalloc((unsigned char **) &d_devbuf,100000000); //should be sufficient
err = cudaMalloc((unsigned char **) &d_imin,i nw*inh);
err = cudaMalloc((unsigned char **) &d_retim, inw*inh);
b_out[0] = (int)err;
err = cudaMalloc((unsigned char **) &d_tpl, tplw*tplh);
b_out[1] = (int)err;
err = cudaMalloc((unsigned char **) &d_resari, resw*resh);
err = cudaMalloc((float **) &d_resarf, resw*resh*sizeof(float));
b_out[2] = (int)err;
err = cudaMemcpy(d_imin, imin, inw * inh,cudaMemcpyHostToDevice);
b_out[3] = (int)err;
// next statement just to check wether image has arrived in GPU (and it has)
err = cudaMemcpy(retim, d_imin, inw*inh, cudaMemcpyDeviceToHost);
err = cudaMemcpy(d_tpl, tpl, tplw * tplh,cudaMemcpyHostToDevice);
b_out[4] = (int)err;
err2 = nppiCrossCorrValid_NormLevel_8u_C1RSfs ((Npp8u *)d_imin, inw, roiin, (Npp8u *)d_tpl,tplw, roitpl,
(Npp8u *) d_resari, resw, 250,(Npp8u ) d_devbuf);
b_out[5] = (int)err2;
err = cudaMemcpy(resari, d_resari, reswresh, cudaMemcpyDeviceToHost);
b_out[6] = (int)err;
err2 = nppiCrossCorrValid_NormLevel_8u32f_C1R((Npp8u *)d_imin,inw,roiin,(Npp8u )d_tpl,tplw,roitpl,
(Npp32f ) d_resarf,reswsizeof(float),(Npp8u ) d_devbuf);
b_out[7] = (int)err2;
err = cudaMemcpy(resarf, d_resarf, reswreshsizeof(float), cudaMemcpyDeviceToHost);
b_out[8] = (int)err;
b_out[9] = 998;
}
The image imin is a small 200x150 .bmp file; tpl is a 40x30-pixel patch taken from the same image.
The 2 cross correlations should produce a result bitmap (uint8) and a float array containing the correlation coefficients.
No errors are reported (all functions return zero), but the result is a uint8 image in which every pixel is zero and a two-dimensional float array in which every value is 3.345508e-07.
What am I doing wrong?