Hi, I am using the following code to generate a mex file which I will be using later on to denoise an image using nlml function in matlab. Here, in the mex file generation code, output in two arrays value is required. But I think there is some mistake because of which it is not being able to get output in two arrays value and as a result it is generating all zero values while executing the nlml code in MATLAB using this mex file. Can anyone please tell what is the solution for this? I need a solution on how to get output in two arrays in CUDA?
#include “stdio.h”
#include “stdlib.h”"
#include “stdafx.h”
#include “mex.h”
#include
#include “math.h”
#include “malloc.h”
#include<cuda.h>
#include <cuda_runtime.h>
#include “stdio.h”
#include “conio.h”
#include <stdint.h>
// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values
// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values
//extern “C” void find_out_wrap(dim3, dim3, double*, double*, double*, int, int, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize, mwSize);
extern “C” void find_out_wrap(double , double , double , const mwSize, double, double);
// input:
// array
// voxel coordinate
// search size
// neighbourhood size
//
// output:
// array of size neighbourhood by 2, with sum of squares, values
void mexFunction(int nlhs, mxArray* plhs,
int nrhs, const mxArray* prhs)
{
mwSize dimCount = mxGetNumberOfDimensions(prhs[0]);
if (nrhs != 3) {
mexErrMsgTxt("Arguments: volume, search size, neighbourhood size");
return;
}
if (dimCount != 3) {
mexErrMsgTxt("Data must be 3D array.");
return;
}
if (!mxIsDouble(prhs[0]) || !mxIsDouble(prhs[1]) || !mxIsDouble(prhs[2])) {
mexErrMsgTxt("Data must be double.");
return;
}
if (nlhs != 1) {
mexErrMsgTxt("Need exactly one output.");
return;
}
const mwSize* imDims = mxGetDimensions(prhs[0]);
double* imData = mxGetPr(prhs[0]);
// TODO: check prhs[1], prhs[2], prhs[3] have 3 elements,
double* searchSize = mxGetPr(prhs[1]);
double* neighSize = mxGetPr(prhs[2]);
mwSize win_dim = (2 * searchSize[0] + 1) * (2 * searchSize[1] + 1) * (2 * searchSize[2] + 1);
mwSize mult = imDims[0] * imDims[1] * imDims[2];
//plhs[0] = mxCreateDoubleMatrix(mult, win_dim, mxREAL);
//plhs[0] = mxCreateNumericMatrix(1, mult*win_dim, mxINT32_CLASS, mxREAL);
// double* out = (double*)mxGetPr(plhs[0]);
// double* out_1 = (double*)mxGetPr(plhs[0]);
double* out;
double* out_1;
// double* out = mxGetPr(plhs[0]);
// mwSize *size_first_for = (mwSize *)malloc(sizeof(mwSize) * a);
// int i = 0;
// for (mwSize z = minC[2]; z < maxC[2]; ++z) {
// size_first_for[i] = z;
// i = i + 1;
// }
//exit(0);
int size_imData = sizeof(imData) / sizeof(double);
double *dev_imData, *dev_out, *dev_out_1, *dev_size_first_for;
cudaHostAlloc((void**)&imData, size_imData*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");
cudaHostAlloc((void**)&out,imDims[0]*imDims[1]*imDims[2]*win_dim*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");
cudaHostAlloc((void**)&out_1,imDims[0]*imDims[1]*imDims[2]*win_dim*sizeof(double), cudaHostAllocMapped);
//cudaCheckErrors("Failed to allocate device buffer");
cudaHostGetDevicePointer((void **)&dev_imData, imData, 0);
cudaHostGetDevicePointer((void **)&dev_out, out, 0);
cudaHostGetDevicePointer((void **)&dev_out_1, out_1, 0);
//find_out_wrap(numBlocks, threadsPerBlock, dev_size_first_for, dev_imData, dev_out, size_imData, size_out, minC[0], minC[1], maxC[0], maxC[1], coord[0], coord[1], coord[2], imDims[0], imDims[1], nSize[0], nSize[1], nSize[2], count);
find_out_wrap(dev_imData, dev_out, dev_out_1, imDims, searchSize, neighSize);
//cudaMemcpy(out, d_out, ((l_win*l_win*l_win)*sizeof(int)), cudaMemcpyDeviceToHost);
// cudaCheckErrors("CUDA memcpy failure");
//cudaMemcpy(out, d_out, ((l_win*l_win*l_win)*sizeof(int)), cudaMemcpyDeviceToHost);
// cudaMemcpy(out_1, d_out_1, ((nx*ny*nz*win_dim)*sizeof(int)), cudaMemcpyDeviceToHost);
// cudaCheckErrors("CUDA memcpy failure");
plhs[0] = mxCreateDoubleMatrix(mult, win_dim, mxREAL);
// double* h_Campo_scalato_re = mxGetPr(plhs[0]);
// double* h_Campo_scalato_im = mxGetPi(plhs[0]);
double* out1 = (double*)mxGetPr(plhs[0]);
//double* out1_1 = (double*)mxGetPr(plhs[0]);
cudaMemcpy(out, dev_out, (imDims[0] * imDims[1] * imDims[2] * win_dim), cudaMemcpyHostToHost);
//out1_1 = out_1;
cudaFree(dev_imData);
cudaFree(dev_out);
cudaFree(dev_out_1);
}