Dear All,

I have a written a kernel which is not working properly, I could not figured out why. Please help me. The following is my code and code summary.

Code Summary:

Input: Mx256 matrix and 256x1 matrix

Output1: Mx256 matrix containing the elementwise minimum values of 1x256 ans 256x1 matrices for each M

Output2: a Mx256 matrix containing the elementwise **square of the differences** of each 1x256 and 256x1 matrices for each row M.

Example:

**Example: **

A=rand(50,256);

B=rand(1,256);

[H D]=HMG1(A,B)

```
#include<stdio.h>
#include<cuda.h>
#include "cuda_runtime.h"
#include<math.h>
#include"mex.h"
#include"matrix.h"
__global__ void hMatchG(float *xr, float *qr, float *h, float *d,int N1){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = threadIdx.x;
__shared__ float a;
__shared__ float b;
__shared__ float p;
a=xr[i];
b=qr[j];
__syncthreads();
if (i<N1 && j<256 ) {
//h[i]=(a<b?a:b);
h[i]=fminf(a,b);
p=abs(a-b);
d[i]=p*p;
}
__syncthreads();
}
void mexFunction(int nlhs,mxArray *plhs[],int nrhs, const mxArray *prhs[])
{
float *hr,*dr,*xr,*qr,*id1,*id2,*od1,*od2,isize1,isize2,osize;
int mi1,ni1,N1,mi2,ni2,N2;
mi1=mxGetM(prhs[0]);
ni1=mxGetN(prhs[0]);
N1=mi1*ni1;
plhs[0] = mxCreateDoubleMatrix(mi1, ni1, mxREAL);
plhs[1] = mxCreateDoubleMatrix(mi1, ni1, mxREAL);
if(plhs[0]==NULL && plhs[1]==NULL) mexErrMsgTxt("Could not create mxArray");
mi2=mxGetM(prhs[1]);
ni2=mxGetN(prhs[1]);
N2=mi2*ni2;
id1=(float *)mxGetPr(prhs[0]);
id2=(float *)mxGetPr(prhs[1]);
od1=(float *)mxGetPr(plhs[0]);
od2=(float *)mxGetPr(plhs[1]);
isize1=N1*sizeof(float);
isize2=N2*sizeof(float);
osize=N1*sizeof(float);
cudaMalloc((void **) &xr, isize1);
cudaMemcpy(xr, id1, isize1, cudaMemcpyHostToDevice);
cudaMalloc((void **) &qr, isize2);
cudaMemcpy(qr, id2, isize2, cudaMemcpyHostToDevice);
cudaMalloc((void **) &hr, osize);
cudaMalloc((void **) &dr, osize);
int block_size=256;
int n_blocks=N1/block_size;
hMatchG<<<n_blocks,block_size>>>(xr,qr,hr, dr, N1);
// cudaThreadSynchronize();
cudaMemcpy(od1,hr, osize, cudaMemcpyDeviceToHost);
cudaMemcpy(od2,dr, osize, cudaMemcpyDeviceToHost);
for(int i=0;i<N1;i++)
mexPrintf("%f %f \n",od1[i],od2[i]) ;
cudaFree(xr);
cudaFree(qr);
cudaFree(hr);
cudaFree(dr);
}
```

Please help me.