Hi,

I want to perform the computations below:

- Sum of an array --> through mexfunction #1
- Square of elements of an array and then compute the sum of squares of array elements --> through mexfunction #2

To implement this logic, the matlab script below is used:

```
% Input vector 0.5, 1.5, ..., 10.5 (echoed, because the line has no ';').
% NOTE(review): this range has 11 elements while the MEX sources use
% NUM = 10, so full_sum covers one element more than the GPU sum - confirm.
dataA = 0.5:1:10.5
full_sum = sum(dataA);   % CPU reference value

% --- MEX function 1: build it together with the kernel file, then run it
mexcuda('-v', 'mexfn_sumdata.cu', 'sum_data.cu');
mexfn_sumdata(dataA);

% --- MEX function 2: built as a separate MEX binary, called with no inputs
% NOTE(review): sum_data.h declares Arg as 'static', so this binary has its
% own copy of that pointer, distinct from the one set by mexfn_sumdata.
mexcuda('-v', 'mexfn_squaresumdata.cu');
mexfn_squaresumdata();
```

Related header file “sum_data.h”

```
#ifndef SUM_DATA_H
#define SUM_DATA_H

/*
 * Host wrapper around Compute_Sum.
 *   d_sum       - device pointer to one double that receives the sum
 *   d_subimageF - device pointer to the input values
 * Launches the kernel and prints the result on the host.
 */
void compute_add(double * const d_sum,
                 double const * const d_subimageF);

/*
 * Kernel: adds the input values onto out[0].
 *   out - device pointer to the accumulator (must be initialised by the caller)
 *   in  - device pointer to the input values
 */
__global__ void Compute_Sum(double * const out,
                            double const * const in);

/*
 * Device buffer pointer.
 *
 * WARNING: `static` gives this variable internal linkage, so every
 * translation unit that includes this header gets its OWN independent copy.
 * Two separately built MEX files therefore never share it: a value assigned
 * in one MEX file is not visible in another, where the pointer is still
 * zero-initialised (NULL). To hand a device buffer to another MEX file, pass
 * it through MATLAB explicitly (e.g. the address as a uint64 scalar).
 */
static double *Arg;

#endif /* SUM_DATA_H */
```

Mex function # 1: (mexfn_sumdata.cu)

```
#include "mex.h"
#include "cuda.h"
#include "sum_data.h"
const int NUM = 10;
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
double *val;
cudaMalloc ((void **)&Arg, NUM * sizeof(Arg[0]));
val = (double*) mxGetData(prhs[0]);
cudaMemcpy(Arg, val, NUM*sizeof(double), cudaMemcpyHostToDevice);
printf("\n Inside first mex function \n");
for (int jj=0;jj<NUM;jj++){
printf("\n test[%d]=%f\n",jj,val[jj]);
}
double *h_sum_new = (double*) malloc(1*sizeof(double));
h_sum_new[0] = 0.0f;
double *d_sum;
cudaMalloc ((void **)&d_sum, 1 * sizeof(Arg[0]));
cudaMemcpy(d_sum,&h_sum_new,1*sizeof(double),cudaMemcpyHostToDevice);
Compute_Sum <<<1,1>>> (d_sum,Arg);
cudaDeviceSynchronize();
cudaMemcpy(h_sum_new, d_sum, 1*sizeof(double), cudaMemcpyDeviceToHost);
printf("\n Sum of sub-image[0] = %f \n",h_sum_new[0]);
}
```

CUDA source code: “sum_data.cu”

```
const int NUM = 10;  /* number of input elements the kernel accumulates */

/*
 * Compute_Sum - serially adds in[0..NUM-1] onto out1[0] and prints the
 * running total after every addition.
 *
 *   out1 - device pointer to the accumulator; it is read before it is
 *          written, so the caller must initialise out1[0] (normally to 0.0)
 *   in   - device pointer to at least NUM doubles
 *
 * Launch layout: the body does not use the thread or block index to split the
 * work, so every launched thread runs the whole loop on the same out1[0].
 * Launch it as <<<1,1>>>; more threads would race on the accumulator.
 * Uses no shared memory. Device printf is slow and meant for debugging only.
 */
void __global__ Compute_Sum(double * const out1,
                            double const * const in)
{
    for (int ii = 0; ii < NUM; ii++) {
        out1[0] = out1[0] + in[ii];
        /* The format has three conversions, so three arguments are required:
           thread id, the accumulator index (always 0) and the running sum. */
        printf("\n threadid = %d \t out1[%d] = %f \n",
               (int)threadIdx.x, 0, out1[0]);
    }
}
void compute_add(double * const d_sum,
double const * const d_subimageF)
{
double *h_sum_new = (double*) malloc(NUM*sizeof(double));
h_sum_new[0] = 0.0f;
cudaMemcpy(d_sum,&h_sum_new,1*sizeof(double),cudaMemcpyHostToDevice);
Compute_Sum <<<1,1>>> (d_sum,d_subimageF);
cudaDeviceSynchronize();
cudaMemcpy(h_sum_new, d_sum, 1*sizeof(double), cudaMemcpyDeviceToHost);
printf("\n Sum = %f \n\n\n",h_sum_new[0]);
}
```

Mex function # 2: (mexfn_squaresumdata.cu)

```
#include "mex.h"
#include "cuda.h"
#include "sum_data.h"
const int NUM = 10;
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
double *val_new = (double*)malloc(NUM*sizeof(double));
cudaMemcpy(val_new,Arg,NUM*sizeof(double), cudaMemcpyDeviceToHost);
printf("\n Inside second mex function \n");
for (int jj=0;jj<NUM;jj++){
printf("\n test_new[%d]=%f\n",jj,val_new[jj]);
}
}
```

Problem faced: The data stored in pointer ‘Arg’ is not being accessed from mexfunction #2. Upon printing, the output is zero for all elements.

Inside second mex function

test_new[0]=0.000000

…

test_new[8]=0.000000

test_new[9]=0.000000

Can anyone please suggest why the data is not accessed correctly?

Thanks.