How to get output of two arrays in CUDA?

Hi, I am using the following code to build a MEX file that I later call from an nlml function in MATLAB to denoise an image. The MEX code is supposed to return its results in two arrays, but I think it contains a mistake: the two output arrays are never filled, so the nlml code in MATLAB only ever receives zeros from this MEX file. Can anyone tell me how to fix this? Specifically, how do I return two output arrays from CUDA code?

#include "stdio.h"
#include "stdlib.h"
#include "stdafx.h"
#include "mex.h"
#include
#include "math.h"
#include "malloc.h"
#include<cuda.h>
#include <cuda_runtime.h>
#include "stdio.h"
#include "conio.h"
#include <stdint.h>

// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values

// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values

//extern “C” void find_out_wrap(dim3, dim3, double*, double*, double*, int, int, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize);
extern “C” void find_out_wrap(double , double , double , const mwSize, double, double);

// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values

void mexFunction(int nlhs, mxArray* plhs,
int nrhs, const mxArray* prhs)
{
mwSize dimCount = mxGetNumberOfDimensions(prhs[0]);

if (nrhs != 3) {
	mexErrMsgTxt("Arguments: volume, search size, neighbourhood size");
	return;
}
if (dimCount != 3) {
	mexErrMsgTxt("Data must be 3D array.");
	return;
}
if (!mxIsDouble(prhs[0]) || !mxIsDouble(prhs[1]) || !mxIsDouble(prhs[2])) {
	mexErrMsgTxt("Data must be double.");
	return;
}
if (nlhs != 1) {
	mexErrMsgTxt("Need exactly one output.");
	return;
}

const mwSize* imDims = mxGetDimensions(prhs[0]);
double* imData = mxGetPr(prhs[0]);

// TODO: check prhs[1], prhs[2], prhs[3] have 3 elements, 

double* searchSize = mxGetPr(prhs[1]);
double* neighSize = mxGetPr(prhs[2]);

mwSize win_dim = (2 * searchSize[0] + 1) * (2 * searchSize[1] + 1) * (2 * searchSize[2] + 1);

mwSize mult = imDims[0] * imDims[1] * imDims[2];

//plhs[0] = mxCreateDoubleMatrix(mult, win_dim, mxREAL);

//plhs[0] = mxCreateNumericMatrix(1, mult*win_dim, mxINT32_CLASS, mxREAL);

// double* out = (double*)mxGetPr(plhs[0]);
// double* out_1 = (double*)mxGetPr(plhs[0]);

double* out;
double* out_1;

// double* out = mxGetPr(plhs[0]);

// mwSize *size_first_for = (mwSize *)malloc(sizeof(mwSize) * a);

// int i = 0;

// for (mwSize z = minC[2]; z < maxC[2]; ++z) {
// size_first_for[i] = z;
// i = i + 1;
// }

//exit(0);

int size_imData = sizeof(imData) / sizeof(double);

double *dev_imData, *dev_out, *dev_out_1, *dev_size_first_for;

cudaHostAlloc((void**)&imData, size_imData*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");

cudaHostAlloc((void**)&out,imDims[0]*imDims[1]*imDims[2]*win_dim*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");

cudaHostAlloc((void**)&out_1,imDims[0]*imDims[1]*imDims[2]*win_dim*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");

cudaHostGetDevicePointer((void **)&dev_imData, imData, 0);
cudaHostGetDevicePointer((void **)&dev_out, out, 0);
cudaHostGetDevicePointer((void **)&dev_out_1, out_1, 0);

//find_out_wrap(numBlocks, threadsPerBlock, dev_size_first_for, dev_imData, dev_out, size_imData, size_out, minC[0], minC[1], maxC[0], maxC[1], coord[0], coord[1], coord[2], imDims[0], imDims[1], nSize[0], nSize[1], nSize[2], count);

find_out_wrap(dev_imData, dev_out, dev_out_1, imDims, searchSize, neighSize);

//cudaMemcpy(out, d_out, ((l_win*l_win*l_win)*sizeof(int)), cudaMemcpyDeviceToHost);

//        cudaCheckErrors("CUDA memcpy failure");

//cudaMemcpy(out, d_out, ((l_win*l_win*l_win)*sizeof(int)), cudaMemcpyDeviceToHost);
//        cudaMemcpy(out_1, d_out_1, ((nx*ny*nz*win_dim)*sizeof(int)), cudaMemcpyDeviceToHost);
//        cudaCheckErrors("CUDA memcpy failure");

plhs[0] = mxCreateDoubleMatrix(mult, win_dim, mxREAL);
//   double* h_Campo_scalato_re = mxGetPr(plhs[0]);
//   double* h_Campo_scalato_im = mxGetPi(plhs[0]);

double* out1 = (double*)mxGetPr(plhs[0]);
//double* out1_1 = (double*)mxGetPr(plhs[0]);

cudaMemcpy(out, dev_out, (imDims[0] * imDims[1] * imDims[2] * win_dim), cudaMemcpyHostToHost);
//out1_1 = out_1;

cudaFree(dev_imData);
cudaFree(dev_out);
cudaFree(dev_out_1);

}

“it is generating all zero values while executing the nlml code in MATLAB using this mex file”

Just make sure the device has finished working on the arrays before you use them on the host.

I see no cudaDeviceSynchronize() in the code above.

// Quoted fragment: creation of the MATLAB output and the copy-back step.
// Only plhs[0] is created here, so only one array can reach MATLAB.
plhs[0] = mxCreateDoubleMatrix(mult, win_dim, mxREAL);
// double* h_Campo_scalato_re = mxGetPr(plhs[0]);
// double* h_Campo_scalato_im = mxGetPi(plhs[0]);

// out1 points at the data of plhs[0], but nothing below writes through it.
double* out1 = (double*)mxGetPr(plhs[0]);
//double* out1_1 = (double*)mxGetPr(plhs[0]);

// NOTE(review): the destination is `out`, not `out1`, so plhs[0] is left untouched.
// NOTE(review): the third argument of cudaMemcpy is a byte count; this
// expression is an element count (no sizeof(double) factor).
// NOTE(review): if dev_out is a device pointer, the kind should be
// cudaMemcpyDeviceToHost rather than cudaMemcpyHostToHost - confirm.
cudaMemcpy(out, dev_out, (imDims[0] * imDims[1] * imDims[2] * win_dim), cudaMemcpyHostToHost);
//out1_1 = out_1;

is this part correct??? @little_jimmy

Please use code blocks for the code and there is no need to put the same thread topic in multiple forums.

Your problem appears to be that you only declare a single pointer for the MATLAB MEX return value, but you want two separate result vectors. Here is an example of the correct way, taken from one of my older CUDA MEX files:

// Create the answer for MATLAB and copy back vectors u and z.
// Two separate left-hand-side arrays are created, one per result vector;
// both are real single-precision matrices of size BigCols x num_lambdas.
	plhs[0]=mxCreateNumericMatrix(BigCols,num_lambdas,mxSINGLE_CLASS,mxREAL);
	plhs[1]=mxCreateNumericMatrix(BigCols,num_lambdas,mxSINGLE_CLASS,mxREAL);

	// Host pointers to the storage MATLAB owns for each output.
	float *u_result=(float *)mxGetPr(plhs[0]);
	float *z_result=(float *)mxGetPr(plhs[1]);

	// Copy numbytesVC bytes from each source buffer directly into the MATLAB outputs.
	// NOTE(review): _DTH is presumably an alias for cudaMemcpyDeviceToHost, and
	// D_u / D_z device buffers - they are defined outside this fragment.
	cudaMemcpy(u_result,D_u,numbytesVC,_DTH);//copy  u info back to host
	cudaMemcpy(z_result,D_z,numbytesVC,_DTH);//copy  z info back to host

Then, in your MATLAB script, make sure to have two left-hand-side variables to receive the results, like this:

[result0 result1]= CUDAMEX();

Also, your copy from the CUDA device buffer to the MATLAB host buffer should use cudaMemcpyDeviceToHost, not cudaMemcpyHostToHost.

Much of this information can be found via Google. I seriously wonder why so few people on here use Google first.