How to get output of two arrays in CUDA?

Hi, I am using the following code to generate a mex file which I will be using later on to denoise an image using nlml function in matlab. Here, in the mex file generation code, output in two arrays value is required. But I think there is some mistake because of which it is not being able to get output in two arrays value and as a result it is generating all zero values while executing the nlml code in MATLAB using this mex file. Can anyone please tell what is the solution for this? I need a solution on how to get output in two arrays in CUDA?

#include “stdafx.h”
#include “mex.h”
#include “cuda.h”
#include “cuda_runtime.h”
#include “device_functions.h”
#include “device_launch_parameters.h”

#define BLKXSIZE 32
#define BLKYSIZE 4
#define BLKZSIZE 4

global void find_out(double dev_imData, double dev_out, double dev_out_1, const mwSize imDims, double searchSize, double neighSize)

{

unsigned idx = blockIdx.xblockDim.x + threadIdx.x;
unsigned idy = blockIdx.y
blockDim.y + threadIdx.y;
unsigned idz = blockIdx.z*blockDim.z + threadIdx.z;

int cc = idz*imDims[0]imDims[1] + idyimDims[0] + idx;

int dz = 0, dy = 0, dx = 0;
//int c = 0;
//int z = dev_size_first_for[cc];

mwSize voxelCoord[3] = { idx - 1, idy - 1, idz - 1 };
mwSize searchDims[3];
mwSize nSize[3];

mwSize minC[3];
mwSize maxC[3];

for (int i = 0; i < 3; ++i) {
nSize[i] = ((int)neighSize[i]) / 2;
int x = voxelCoord[i];
int minx = voxelCoord[i] - searchSize[i] / 2;
int maxx = minx + searchSize[i];

if (minx < nSize[i]) minx = nSize[i];
if (maxx > imDims[i] - nSize[i]) maxx = imDims[i] - nSize[i];
searchDims[i] = (maxx - minx);

minC[i] = minx;
maxC[i] = maxx;
if (maxC[i] <= minC[i]) {
//mexErrMsgTxt(“Voxel out of range.”);
return;
}
}

mwSize count = searchDims[0] * searchDims[1] * searchDims[2] - 1;
mwSize coord[3] = { voxelCoord[0] - 1, voxelCoord[1] - 1, voxelCoord[2] - 1 };

mwSize c = 0;

for (mwSize z = minC[2]; z < maxC[2]; ++z) {
for (mwSize y = minC[1]; y < maxC[1]; ++y) {
for (mwSize x = minC[0]; x < maxC[0]; ++x) {
if (x == coord[0] && y == coord[0] && z == coord[0])
continue;

unsigned int bi = zimDims[0] * imDims[1] + yimDims[0] + x;
unsigned int bj = coord[2] * imDims[0] * imDims[1] + coord[1] * imDims[0] + coord[0];

double val = dev_imData[bi];
double diff = 0.0;

int rel1 = -nSize[2] * imDims[0] * imDims[1] - nSize[1] * imDims[0] - nSize[0];
for (int dz = -nSize[2]; dz <= nSize[2]; ++dz) {
int rel2 = rel1;
for (int dy = -nSize[1]; dy <= nSize[1]; ++dy) {
int rel3 = rel2;
for (int dx = -nSize[0]; dx <= nSize[0]; ++dx) {
if (dz || dy || dx) {
double d = dev_imData[bi + rel3] - dev_imData[bj + rel3];
diff += d * d;
}
rel3++;
}
rel2 += imDims[0];
}
rel1 += imDims[0] * imDims[1];
}

//c = atomicAdd(c, 1);

dev_out[c * (imDims[0] * imDims[1] * imDims[2]) + cc] = diff;
dev_out_1[c * (imDims[0] * imDims[1] * imDims[2]) + cc] = val;
c++;
//dev_out[c] = diff;

//dev_out[c + count] = val;
}
}
}
//__syncthreads();
}

//extern “C” void find_out_wrap(dim3 numBlocks, dim3 threadsPerBlock, double *dev_size_first_for, double *dev_imData, double *dev_out, int size_imData, int size_out, mwSize minC0, mwSize minC1, mwSize maxC0, mwSize maxC1, mwSize coord0, mwSize coord1, mwSize coord2, mwSize imDims0, mwSize imDims1, mwSize nSize0, mwSize nSize1, mwSize nSize2, mwSize count)
extern “C” void find_out_wrap(double dev_imData, double dev_out, double dev_out_1, const mwSize imDims, double searchSize, double neighSize)
{
// dim3 threadsPerBlock(16);
// dim3 numBlocks((a + 15) / 16);

const dim3 blockSize(BLKXSIZE, BLKYSIZE, BLKZSIZE);
const dim3 gridSize(((imDims[0] + BLKXSIZE - 1) / BLKXSIZE), ((imDims[1] + BLKYSIZE - 1) / BLKYSIZE), ((imDims[2] + BLKZSIZE - 1) / BLKZSIZE));

// int block_size = 32;
// int n_blocks = a / block_size + (a%block_size == 0 ? 0 : 1);

find_out <<<gridSize, blockSize >>>(dev_imData, dev_out, dev_out_1, imDims, searchSize, neighSize);

}