Unhandled exception at 0x00007FFB8367A388 :: Microsoft C++ exception: thrust::system::system_error a...

I have a program with some parts written as C functions and some parts written as CUDA kernels. In my C functions I pass arguments by reference (&) because I want the new values to replace the old values across the loop iterations. I do not use & directly in the kernels, but the values passed by reference into the C-function arguments are copied into the kernel variables. This link describes a related problem:
https://stackoverflow.com/questions/33512284/thrust-exception-thrustsystemsystem-error-at-memory-location-0x00000000
but I don't know how to replace the new values with the old ones without getting this error. Could anyone help me?
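To narrow down where the error is first raised, each CUDA runtime call can be wrapped in a check, and the Thrust calls in a try/catch. Below is a minimal error-checking sketch; the CUDA_CHECK macro name is my own, not an existing API:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <thrust/system_error.h>

// abort with file/line information as soon as any runtime call fails
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// usage: wrap each runtime call and check after each kernel launch
//   CUDA_CHECK(cudaMalloc((void **)&d_x_train, sizeof(double) * x_train.size()));
//   some_kernel<<<grid, block>>>(...);
//   CUDA_CHECK(cudaGetLastError());
//   CUDA_CHECK(cudaDeviceSynchronize());
//
// Thrust reports failures as exceptions, so its calls can be wrapped too:
//   try {
//       MAX_MIN_Matrix(d_x_train, row_Xtrain * col_Xtrain, &Min, &Max);
//   } catch (thrust::system::system_error &e) {
//       fprintf(stderr, "Thrust error: %s\n", e.what());
//   }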

void encoder(const vector<double> &x_train, int row_Xtrain, int col_Xtrain, const vector<double> &a_f, int row_af, int col_af, const vector<double> &b_f, vector<double> &H_F, std::string G){
double *x_a = (double*)malloc(row_Xtrain * col_af * sizeof(double));
double *x_a_b = (double*)malloc(row_Xtrain * col_af * sizeof(double));
//
double *d_x_train, *d_a_f, *d_b_f, *d_x_a, *d_x_a_b, *d_H_F;
cudaMalloc((void **)&d_x_train, sizeof(double) * x_train.size());
cudaMalloc((void **)&d_a_f, sizeof(double) * a_f.size());
cudaMalloc((void **)&d_b_f, sizeof(double) * NERUN);
cudaMalloc((void **)&d_x_a, sizeof(double) * row_Xtrain * col_af);
cudaMalloc((void **)&d_x_a_b, sizeof(double) * row_Xtrain * col_af);
cudaMalloc((void **)&d_H_F, sizeof(double) * row_Xtrain * col_af);

// transfer data from host to device
cudaMemcpy(d_x_train, x_train.data(), sizeof(double)* x_train.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_a_f, a_f.data(), sizeof(double)* a_f.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_b_f, b_f.data(), sizeof(double)* NERUN, cudaMemcpyHostToDevice);
//////////////////////////////////////////------------call function-----------//////////////////////////////////////////
multi(d_x_train, d_a_f, d_x_a, row_Xtrain, col_Xtrain, row_af, col_af, row_Xtrain, col_af);
//test
cudaMemcpy(x_a, d_x_a, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost);
SUM(d_x_a, d_b_f, d_x_a_b, row_Xtrain * col_af, row_Xtrain, col_af);
//test
cudaMemcpy(x_a_b, d_x_a_b, sizeof(double) * row_Xtrain * col_af, cudaMemcpyDeviceToHost);
if (G == "sigmoid"){
//sigmoid
Sigmoid(d_x_a_b, d_H_F, row_Xtrain * col_af);
}
if (G == "sinus"){
//sin
sinus(d_x_a_b, d_H_F, row_Xtrain * col_af);
}
/////////////////////////////////////////-----------------end---------------------///////////////////////////////////////
// transfer result from device to host
cudaMemcpy(H_F.data(), d_H_F, sizeof(double) * row_Xtrain * col_af, cudaMemcpyDeviceToHost);
//
cudaFree(d_x_train);
cudaFree(d_a_f);
cudaFree(d_b_f);
cudaFree(d_x_a);
cudaFree(d_x_a_b);
cudaFree(d_H_F);
}

void decoder(const vector<double> &x_train, int row_Xtrain, int col_Xtrain, const vector<double> &a_f, int row_af, int col_af, const vector<double> &b_f, vector<double> &H_F, vector<double> &HF_sudoinverse, vector<double> &a_n, int row_an, int col_an, vector<double> &b_n, std::string G, int N, int nerun){
int C;
double Max, Min;
//
double h_answer = 0, *d_mean;
cudaMalloc((void **)&d_mean, sizeof(double));
cudaMemcpy(d_mean, &h_answer, sizeof(double), cudaMemcpyHostToDevice);
printf("h_answer: %f \n", h_answer);

//host
double *H_f_T = (double*)malloc(NERUN * row_Data_train * sizeof(double));
double *H_H_T1 = (double*)malloc(NERUN * NERUN * sizeof(double));
double *I = (double*)malloc(NERUN * NERUN * sizeof(double));
double *H_H_T_I = (double*)malloc(NERUN * NERUN * sizeof(double));
//double *HF_sudoinverse = (double*)malloc(NERUN * row_Data_train * sizeof(double));
//vector<double> HF_sudoinverse(NERUN * row_Data_train);
double *x_train_norm = (double*)malloc(row_Xtrain * col_Xtrain * sizeof(double));
double *x_train_n_L = (double*)malloc(row_Xtrain * col_Xtrain * sizeof(double));
//double *a_n = (double*)malloc(NERUN * col_Xtrain * sizeof(double));
double *H_a_n = (double*)malloc(row_Data_train * col_Xtrain * sizeof(double));
double *inv = (double*)malloc(NERUN * NERUN * sizeof(double));

//device
double *d_x_train, *d_x_train_norm, *d_x_train_n_a, *d_x_train_n_L, *d_H_F, *d_H_f_T,
*d_H_H_T1, *d_H_H_T2, *d_I, *d_H_H_T_I, *d_inv, *d_HF_sudoinverse, *d_a_n, *d_H_a_n;
cudaMalloc((void **)&d_x_train, sizeof(double) * x_train.size());
cudaMalloc((void **)&d_x_train_norm, sizeof(double) * row_Xtrain * col_Xtrain);
cudaMalloc((void **)&d_x_train_n_a, sizeof(double) * row_Xtrain * col_Xtrain);
cudaMalloc((void **)&d_x_train_n_L, sizeof(double) * row_Xtrain * col_Xtrain);
cudaMalloc((void **)&d_H_F, sizeof(double) * H_F.size());
cudaMalloc((void **)&d_H_f_T, sizeof(double) * H_F.size());
cudaMalloc((void **)&d_H_H_T1, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_H_H_T2, sizeof(double) * row_Data_train * row_Data_train);
cudaMalloc((void **)&d_I, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_H_H_T_I, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_inv, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_HF_sudoinverse, sizeof(double) * NERUN * row_Data_train);
cudaMalloc((void **)&d_a_n, sizeof(double) * NERUN * col_Xtrain);
cudaMalloc((void **)&d_H_a_n, sizeof(double) * row_Data_train * col_Xtrain);

//Transfer Data from Host To Device
//thrust::device_vector<double> ix_train(x_train);
//double *d_x_train = thrust::raw_pointer_cast(&ix_train[0]);
cudaMemcpy(d_x_train, x_train.data(), sizeof(double)* x_train.size(), cudaMemcpyHostToDevice);
//cudaMemcpy(d_x_train, x_train, sizeof(double)* row_Data_train* COL, cudaMemcpyHostToDevice);

//////////////////////////////////////////------------call function-----------//////////////////////////////////////////
encoder(x_train, row_Xtrain, col_Xtrain, a_f, row_af, col_af, b_f, H_F, "sigmoid");
// transfer data from host to device
cudaMemcpy(d_H_F, H_F.data(), sizeof(double)* H_F.size(), cudaMemcpyHostToDevice);
Transpose(d_H_f_T, d_H_F, row_Data_train, NERUN);
//test
cudaMemcpy(H_f_T, d_H_f_T, sizeof(double) * NERUN * row_Data_train, cudaMemcpyDeviceToHost);

if (N > nerun){
multi(d_H_f_T, d_H_F, d_H_H_T1, NERUN, row_Data_train, row_Data_train, NERUN, NERUN, NERUN);
//test
cudaMemcpy(H_H_T1, d_H_H_T1, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost);
unit_matrix_cpu(d_I, NERUN, NERUN);
//test
//cudaMemcpy(I, d_I, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost);
C = pow(10, 6);
divisional_cpu(C, d_I, NERUN, NERUN);
//test
cudaMemcpy(I, d_I, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost);
SUM2D_cpu(d_H_H_T1, d_I, d_H_H_T_I, NERUN, NERUN);
//test
cudaMemcpy(H_H_T_I, d_H_H_T_I, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost);
inv = inverse(H_H_T_I, NERUN);
cudaMemcpy(d_inv, inv, sizeof(double)* NERUN* NERUN, cudaMemcpyHostToDevice);
multi(d_inv, d_H_f_T, d_HF_sudoinverse, NERUN, NERUN, NERUN, row_Data_train, NERUN, row_Data_train);
//test
cudaMemcpy(HF_sudoinverse.data(), d_HF_sudoinverse, sizeof(double)*NERUN * row_Data_train, cudaMemcpyDeviceToHost);
}

if (N < nerun){
multi(d_H_F, d_H_f_T, d_H_H_T2, row_Data_train, NERUN, NERUN, row_Data_train, row_Data_train, row_Data_train );
unit_matrix_cpu(d_I, NERUN, NERUN);
C = pow(10, 6);
divisional_cpu(C, d_I, NERUN, NERUN);
SUM2D_cpu(d_H_H_T2, d_I, d_H_H_T_I, NERUN, NERUN);
cudaMemcpy(H_H_T_I, d_H_H_T_I, sizeof(double) * NERUN * NERUN, cudaMemcpyDeviceToHost); // inverse() expects a host pointer
inv = inverse(H_H_T_I, NERUN);
cudaMemcpy(d_inv, inv, sizeof(double)* NERUN* NERUN, cudaMemcpyHostToDevice);
multi(d_inv, d_H_f_T, d_HF_sudoinverse, NERUN, NERUN, NERUN, row_Data_train, NERUN, row_Data_train);
}

if (G == "sinus"){
MAX_MIN_Matrix(d_x_train, row_Xtrain * col_Xtrain, &Max, &Min);
Norm_alize_cpu(d_x_train, d_x_train_norm, Min, Max, row_Xtrain * col_Xtrain);
arcsinus(d_x_train_norm, d_x_train_n_a, row_Xtrain *col_Xtrain);
multi(d_HF_sudoinverse, d_x_train_n_a, d_a_n, NERUN, row_Data_train, row_Xtrain, col_Xtrain, NERUN, col_Xtrain);
}

if (G == "sigmoid"){
MAX_MIN_Matrix(d_x_train, row_Xtrain * col_Xtrain, &Min, &Max);
//std::cout << "min =" << Min << "\n";
//std::cout << "max =" << Max << "\n";
Norm_alize_cpu(d_x_train, d_x_train_norm, Min, Max, row_Xtrain * col_Xtrain);
//test
cudaMemcpy(x_train_norm, d_x_train_norm, sizeof(double) * row_Xtrain * col_Xtrain, cudaMemcpyDeviceToHost);
negative_log(d_x_train_norm, d_x_train_n_L, row_Xtrain * col_Xtrain);
//test
cudaMemcpy(x_train_n_L, d_x_train_n_L, sizeof(double) * row_Xtrain * col_Xtrain, cudaMemcpyDeviceToHost);
multi(d_HF_sudoinverse, d_x_train_n_L, d_a_n, NERUN, row_Data_train, row_Xtrain, col_Xtrain, NERUN, col_Xtrain);
//test
cudaMemcpy(a_n.data(), d_a_n, sizeof(double) * NERUN * col_Xtrain, cudaMemcpyDeviceToHost);
}
multi(d_H_F, d_a_n, d_H_a_n, row_Data_train, NERUN, NERUN, col_Xtrain, row_Data_train, col_Xtrain);
//test
cudaMemcpy(H_a_n, d_H_a_n, sizeof(double) * row_Data_train * col_Xtrain, cudaMemcpyDeviceToHost);
MSE(d_H_a_n, d_x_train_n_L, row_Data_train * col_Xtrain, d_mean);

cudaMemcpy(&h_answer, d_mean, sizeof(double), cudaMemcpyDeviceToHost);
printf("h_answer: %f \n", h_answer);
double b = sqrt(h_answer);
printf("b: %f \n", b);

for (int i = 0; i < NERUN; i++){
b_n[i] = b;
}
/////////////////////////////////////////-----------------end---------------------///////////////////////////////////////
cudaFree(d_x_train);
cudaFree(d_x_train_norm);
cudaFree(d_x_train_n_a);
cudaFree(d_x_train_n_L);
cudaFree(d_H_F);
cudaFree(d_H_f_T);
cudaFree(d_H_H_T1);
cudaFree(d_H_H_T2);
cudaFree(d_I);
cudaFree(d_H_H_T_I);
cudaFree(d_inv);
cudaFree(d_HF_sudoinverse);
cudaFree(d_a_n);
cudaFree(d_H_a_n);
}

void Autoencoder(int n, int L, vector<double> X_train){
int k = 0, i = 0;

vector<double> a_f(COL * NERUN);
vector<double> b_f(NERUN);
vector<double> H_F(row_Data_train * NERUN);
vector<double> HF_sudoinverse(NERUN * row_Data_train);
vector<double> b_n(NERUN);
vector<double> a_n;
vector<double> a_f_new(NERUN * NERUN);
vector<double> b_f_new(NERUN);

a_f_new = rand_data(NERUN * NERUN);
b_f_new = rand_data(NERUN);

while (k < L){
if (k == 0){
a_f = rand_data(COL * NERUN);
b_f = rand_data(NERUN);
decoder(X_train, row_Data_train, COL, a_f, COL, NERUN, b_f, H_F, HF_sudoinverse, a_n, NERUN, COL, b_n, "sigmoid", 300, 200);
double *d_a_n, *d_a_f;
cudaMalloc((void **)&d_a_n, sizeof(double) * NERUN * COL);
cudaMalloc((void **)&d_a_f, sizeof(double) * NERUN * COL);
cudaMemcpy(d_a_n, a_n.data(), sizeof(double) * NERUN * COL, cudaMemcpyHostToDevice);
Transpose(d_a_f, d_a_n, NERUN, COL);
cudaMemcpy(a_f.data(), d_a_f, sizeof(double) * a_n.size(), cudaMemcpyDeviceToHost);
for (i = 0; i < b_n.size(); i++){
b_f[i] = b_n[i];}
}
else if (k < L){
decoder(X_train, row_Data_train, COL, a_f, COL, NERUN, b_f, H_F, HF_sudoinverse, a_n, NERUN, COL, b_n, "sigmoid", 300, 200);
double *d_a_n, *d_a_f;
cudaMalloc((void **)&d_a_n, sizeof(double) * NERUN * COL);
cudaMalloc((void **)&d_a_f, sizeof(double) * NERUN * COL);
cudaMemcpy(d_a_n, a_n.data(), sizeof(double) * NERUN * COL, cudaMemcpyHostToDevice);
Transpose(d_a_f, d_a_n, NERUN, COL);
cudaMemcpy(a_f.data(), d_a_f, sizeof(double) * NERUN * COL, cudaMemcpyDeviceToHost);
for (i = 0; i < b_n.size(); i++){
b_f[i] = b_n[i];}
}
k++;
}
}
}

__global__ void Normalize_kernel(double *input1, double *input2, double *output, int row, int col){
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < row_liver * col_liver)
output[id] = input1[id] / input2[id];

}

// this function finds the max value in each column
// it uses the cuBLAS library

double* MAX_VALUE_ECOL(double *X){

thrust::host_vector<double> h_data;
thrust::device_vector<double>d_data;

// vector<double> value_result(col_liver);
double* value_result =(double*)malloc( col_liver * sizeof(double));
double* result_h_data =(double*)malloc( row_liver* col_liver * sizeof(double));

 for(int i=0; i< row_liver * col_liver ; i++){
    h_data.push_back(X[i]);
    d_data.push_back(h_data[i]);

}

cublasHandle_t handle;
cublasCreate(&handle);

int result;
for (int i=0; i< col_liver ; i++) { 
cublasIdamax(handle, row_liver , (double*)thrust::raw_pointer_cast(d_data.data())  + i, col_liver , &result);
   // printf("%i %f\n",result,h_data[i+(result-1)* col_liver]);
    value_result[i] = h_data[i+(result-1)* col_liver] ;
   // printf(" %f\n",value_result[i]);

}
return value_result;
}

extern "C"
void Normalize_cpu(double *input1 ,double *input2 ,double *output , int row , int col) {
int n = row_liver * col_liver;
int blockSize = 512;
int gridSize = (int) ceil((float) n / blockSize);
Normalize_kernel<<< gridSize , blockSize >>>(input1 ,input2 ,output , row , col);
}
/*int k=0;
for(int i=0 ; i<h_data.size() ;i++){
if( k == col_liver ){
k=0;}
result_h_data[i] = h_data[i] / value_result [k];
k++;
}
for(int i=0 ; i<h_data.size() ;i++){
printf("%f \n", result_h_data[i]); }

return result_h_data;

}*/
// to allow the Normalize function to run in parallel we enlarge this vector to the size of the data set: the vector X holds only the 7 per-column max values, so with this function we build a vector the same size as the input
extern "C"
double* vector_big(double* X){
double *MV = (double*)malloc(row_liver * col_liver * sizeof(double));
int j = 0;
for (int i = 0; i < row_liver* col_liver; i++){
if (j >= col_liver){
j = 0;
}
MV[i] = X[j];
j++;
}
return MV;
}
extern "C"
void data(double* X , double* Y_train ,double* Y_test ,double* X_train, double* X_test) {
double mazrab[row_liver];
int j = 0 , l=0 , m=0 , n=0 , w=0;
mazrab[w] = col_liver -1;
for (int r=0 ; r < (row_liver * col_liver) -1 ; ){
r = mazrab[w] + col_liver;
w++;
mazrab[w] = r ;
}
int k=0;
for(int i=0 ; i<row_liver * col_liver ; i++ ){
if (i<row_x_train && i!= mazrab[k]){
X_train[j] = X[i];
j++;
}
if (i<row_x_train && (i == mazrab[k]) ){
Y_train[l] = X[i];
l++;
k++;
}
if(i >=row_x_train && i != mazrab[k]){
X_test[m] = X[i];
m++;
}
if(i >=row_x_train && (i == mazrab[k])){
Y_test[n] = X[i] ;
n++;
k++;
}
}
}
extern "C"
void CLASStovector(double *array, double *result, int size_array){
int j=0 ;
for(int i=0 ; i<size_array ; i++){
if (array[i] ==0.5){
result [j] = 1;
result [j+1] = 0;
j+=2; }
if (array [i] == 1){
result[j] = 0;
result[j+1] = 1;
j+=2; }
}
}
extern "C"
double* index_Max(double *input, int size_matrix, int classs){
double* arg_max = (double*)malloc(data_train* sizeof(double));
int j=0 ,k=0 , count =-1 , number=0 ;
double Max =-1;
for(int i=0 ; i< size_matrix +1 ; i++ ){
count++;
if(count == classs){
j=0;
Max=-1;
count=0;
arg_max[k] = number;
k++;

    }
if( j < classs && input[i] > Max) {
    Max = input[i] ;
    number = i ;
    j++ ;

}

}
return arg_max;
}
extern "C++"
std::vector<double> rand_data(int size_matrix){

size_t n = size_matrix;
size_t i;
curandGenerator_t gen;
double *devData ;
double  mean ,stddev ;

// double *input = (double*)malloc(n * sizeof(double));
vector<double> input(size_matrix);

/* Allocate n doubles on device */
cudaMalloc((void **)&devData, n*sizeof(double));

/* Create pseudo-random number generator */
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

/* Set seed */
curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);

/* Generate n doubles on device */
curandGenerateNormalDouble(gen, devData, n, 0.0,1.0);

/* Copy device memory to host */
cudaMemcpy(input.data(), devData, n * sizeof(double),
    cudaMemcpyDeviceToHost);

/* Show result */

/* for(i = 0; i < n; i++) {
printf("i= %d , %1.20f ", i, input[i]);
}
printf("\n"); */

/* Cleanup */
curandDestroyGenerator(gen);
cudaFree(devData);  
return input;

}
// Compute C = A * B
__global__ void matrixMultiplyShared(double *A, double *B, double *C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
__shared__ double sA[32][32 + 1]; // Tile size of 32x32
__shared__ double sB[32][32 + 1];

int Row = blockDim.y*blockIdx.y + threadIdx.y;
int Col = blockDim.x*blockIdx.x + threadIdx.x;
double Cvalue = 0.0;
sA[threadIdx.y][threadIdx.x] = 0.0;
sB[threadIdx.y][threadIdx.x] = 0.0;

for (int k = 0; k < (((numAColumns - 1)/ 32) + 1); k++)
{
    if ( (Row < numARows) && (threadIdx.x + (k*32)) < numAColumns)
    {
        sA[threadIdx.y][threadIdx.x] = A[(Row*numAColumns) + threadIdx.x + (k*32)];
    }
    else
    {
        sA[threadIdx.y][threadIdx.x] = 0.0;
    }            
    if ( Col < numBColumns && (threadIdx.y + k*32) < numBRows)
    {
        sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k*32)*numBColumns + Col];
    }
    else
    {
        sB[threadIdx.y][threadIdx.x] = 0.0;
    }            
    __syncthreads();

    for (int j = 0; j < 32; ++j)
    {
        Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x];
    }
    __syncthreads(); // wait until the tile is fully consumed before it is overwritten in the next iteration
}
if (Row < numCRows && Col < numCColumns)
{
    C[Row*numCColumns + Col] = Cvalue;
}

}
extern "C"
void multi(double * A, double * B, double * C, int numARows,
int numAColumns, int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
// Initialize the grid and block dimensions
dim3 dimBlock(32, 32, 1);
dim3 dimGrid((numCColumns / 32) + 1, (numCRows / 32) + 1, 1);

//@@ Launch the GPU Kernel here
matrixMultiplyShared<<<dimGrid, dimBlock>>>(A, B, C, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

}
__global__ void SUM_kernel(double* matrix, const double* vector, double *output, const unsigned int size)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// add the corresponding bias-vector element to the current matrix element
output[idx] = matrix[idx] + vector[threadIdx.x];
}
}

extern "C"
void SUM ( double* matrix, const double* vector , double* output ,const unsigned int size , int row_out , int col_out)
{
int gridSize = row_out ;
int blockSize = col_out;

SUM_kernel<<< gridSize , blockSize >>>( matrix , vector , output , size );
}

__global__ void Sigmoid_kernel(double* input, double* output, const unsigned int size)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// take the sigmoid of each element
output[idx] = ( 1 / (1 + exp((-1) * input [idx])));
}
}

extern "C"
void Sigmoid(double* input, double* output , const unsigned int size)
{
int blockSize = 1024;
int gridSize = (int) ceil((float) size / blockSize);

Sigmoid_kernel<<< gridSize , blockSize >>>( input , output , size );
}

__global__ void sinus_kernel(double *input , double *output , int size)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// take the sine of each element
output[idx] = sin( input[idx] );
}
}

extern "C"
void sinus(double *input , double *output , int size)
{
int blockSize = 1024;
int gridSize = (int) ceil((float) size / blockSize);

sinus_kernel<<< gridSize , blockSize >>>( input , output , size );

}

__global__ void transpose_kernel(double *odata, double *idata, int width, int height)
{
__shared__ double block[BLOCK_DIM][BLOCK_DIM+1];

// read the matrix tile into shared memory
// load one element per thread from device memory (idata) and store it
// in transposed order in block
unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
if((xIndex < width) && (yIndex < height))
{
unsigned int index_in = yIndex * width + xIndex;
block[threadIdx.y][threadIdx.x] = idata[index_in];
}

    // synchronise to ensure all writes to block[][] have completed

__syncthreads();

// write the transposed matrix tile to global memory (odata) in linear order
xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
if((xIndex < height) && (yIndex < width))
{
unsigned int index_out = yIndex * height + xIndex;
odata[index_out] = block[threadIdx.x][threadIdx.y];
}
}

extern "C"
void Transpose(double *odata, double *idata, int width, int height)
{

// setup execution parameters
int gridSize_x = (int) ceil((float) width / BLOCK_DIM);
int gridSize_y = (int) ceil((float) height / BLOCK_DIM);
dim3 grid(gridSize_x, gridSize_y , 1);
dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

transpose_kernel<<< grid, threads >>>(odata, idata, width , height );

}

__global__ void unit_matrix_kernel(double *I, int numR, int numC) {

int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(y < numR && x < numC) {
      if(x == y)
            I[numR * y + x ] =1;

     if( x != y)
          I[numR * y + x ] =0;
}

}
extern "C"
void unit_matrix_cpu(double *I ,int ROW ,int COL ) {

int blockSize = 32;
int gridSize_x = (int) ceil((float) COL / blockSize);
int gridSize_y = (int) ceil((float) ROW / blockSize);
dim3 dimGrid(gridSize_x , gridSize_y );
//COL and ROW here are the column and row counts of the matrix
dim3 dimBlock( blockSize , blockSize );
unit_matrix_kernel<<<dimGrid,dimBlock>>>(I , ROW , COL);
}

__global__ void divisional_Kernel(int adad , double *I, int numR, int numC) {

int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(y < numR && x < numC) {
      if(x == y){
       I[numR * y + x ] = I[numR * y + x ] / adad ;

}
}
}
extern "C"
void divisional_cpu(int adad , double *I, int ROW ,int COL) {

int blockSize = 32;
int gridSize_x = (int) ceil((float) COL / blockSize);
int gridSize_y = (int) ceil((float) ROW / blockSize);
dim3 dimGrid(gridSize_x , gridSize_y );
//COL and ROW here are the column and row counts of the matrix
dim3 dimBlock( blockSize , blockSize );
divisional_Kernel<<<dimGrid,dimBlock>>>(adad, I, ROW, COL); // pass the divisor argument instead of a hard-coded 1000000
}

// grid 2D block 2D
__global__ void SUM2D_kernel(double *A, double *B, double *C, int nx, int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (ix < nx && iy < ny) // ix indexes columns (nx), iy indexes rows (ny)
C[idx] = A[idx] + B[idx];
}
extern "C"
void SUM2D_cpu(double *A, double *B, double *C, const int nx, const int ny)
{
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

 SUM2D_kernel<<<grid, block>>>(A, B, C, nx, ny);

}

extern "C"
double* inverse(double* L, int n)
{
cublasHandle_t cu_cublasHandle;
cublasCreate(&cu_cublasHandle);
double** adL;
double** adC;
double* dL;
double* dC;
int* dLUPivots;
int* dLUInfo;

size_t szA = n * n * sizeof(double);

cudaMalloc(&adL, sizeof(double*));
cudaMalloc(&adC, sizeof(double*));
cudaMalloc(&dL, szA);
cudaMalloc(&dC, szA);
cudaMalloc(&dLUPivots, n * sizeof(int));
cudaMalloc(&dLUInfo, sizeof(int));
cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice);
cudaMemcpy(adL, &dL, sizeof(double*), cudaMemcpyHostToDevice);
cudaMemcpy(adC, &dC, sizeof(double*), cudaMemcpyHostToDevice);

cublasDgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1);
cudaDeviceSynchronize();

 cublasDgetriBatched(cu_cublasHandle, n, (const double **)adL, n, dLUPivots, adC, n, dLUInfo, 1);
cudaDeviceSynchronize();


double* res = (double*)malloc(szA);

cudaMemcpy(res,dC , szA, cudaMemcpyDeviceToHost);

cudaFree(adL);
cudaFree(adC);
cudaFree(dL);
cudaFree(dC);
cudaFree(dLUPivots);
cudaFree(dLUInfo);
cublasDestroy(cu_cublasHandle);

return res;

}

// this function finds the max & min values over the entire matrix
extern "C"
void MAX_MIN_Matrix(double *input ,int size_matrix ,double *min , double *max ){

thrust::device_vector<double> d_A(size_matrix);

// input is a device pointer; wrap it so Thrust treats it as device memory
// instead of dereferencing it on the host (a common cause of
// thrust::system::system_error)
thrust::device_ptr<double> d_input(input);
thrust::copy(d_input, d_input + size_matrix, d_A.begin());
thrust::minimum<double> op;
thrust::maximum<double> op1;
*min = thrust::reduce(d_A.begin(), d_A.end(), 1.0e6, op);
*max = thrust::reduce(d_A.begin(), d_A.end(), -1.0e6, op1);
}
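As an aside on the device_ptr wrapping above (my reading of the Thrust dispatch rules, worth verifying against the Thrust documentation): a raw pointer passed to a Thrust algorithm is assumed to point to host memory, so device allocations such as d_x_train must be tagged first. A minimal self-contained sketch of the pattern:

#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>

// returns the maximum of n doubles already resident in device memory
double device_max(double *d_data, int n) {
    thrust::device_ptr<double> p(d_data);   // tag the raw pointer as device memory
    return thrust::reduce(p, p + n, -1.0e6, thrust::maximum<double>());
}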

__global__ void Norm_alize_kernel(double *input, double *output, double min , double max ,int size)
{
double a = 0.1 , b = 0.9 ;
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
output[idx] = (( input[idx] - min ) /( max - min )) * (b - a) + a;

}
extern "C"
void Norm_alize_cpu(double *input, double *output, double min , double max ,int size)
{
int blockSize = 512;
int gridSize = (int) ceil((float) size / blockSize);

 Norm_alize_kernel<<<gridSize, blockSize>>>(input, output, min,max , size);

}
__global__ void negative_log_kernel(double *input ,double *output ,int size_matrix)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
{
output[idx] = (-1) * log (( 1/input[idx]) - 1);
}
}

extern "C"
void negative_log (double *input ,double *output ,int size_matrix)
{

int blockSize = 512;
int gridSize = (int) ceil((float) size_matrix / blockSize);

 negative_log_kernel<<<gridSize, blockSize>>>( input , output , size_matrix);

}

__global__ void arcsinus_kernel(double *input ,double *output ,int size_matrix)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
{
output[idx] = asin(input[idx]);
}
}

extern "C"
void arcsinus (double *input ,double *output ,int size_matrix)
{

int blockSize = 512;
int gridSize = (int) ceil((float) size_matrix / blockSize);

 arcsinus_kernel<<<gridSize, blockSize>>>( input , output , size_matrix);

}

__global__ void MSE_kernel(double *input1, double *input2, int size_matrix, double *mse){
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
atomicAdd(mse, (((input1[idx] - input2[idx]) * (input1[idx] - input2[idx])) / size_matrix));

}

extern "C"
void MSE(double *input1, double *input2, int size_matrix, double *mse)
{

int blockSize = 512;
int gridSize = (int)ceil((float)size_matrix / blockSize);

MSE_kernel<<<gridSize, blockSize>>>(input1, input2, size_matrix, mse);

}

The problem occurs in this section of the code above:

else if (k < L){
decoder(X_train, row_Data_train, COL, a_f, COL, NERUN, b_f, H_F, HF_sudoinverse, a_n, NERUN, COL, b_n, "sigmoid", 300, 200);

I do not have this problem when (k == 0):

if (k == 0){
a_f = rand_data(COL * NERUN);
b_f = rand_data(NERUN);
decoder(X_train, row_Data_train, COL, a_f, COL, NERUN, b_f, H_F, HF_sudoinverse, a_n, NERUN, COL, b_n, "sigmoid", 300, 200);
double *d_a_n, *d_a_f;
cudaMalloc((void **)&d_a_n, sizeof(double) * NERUN * COL);
cudaMalloc((void **)&d_a_f, sizeof(double) * NERUN * COL);
cudaMemcpy(d_a_n, a_n.data(), sizeof(double) * NERUN * COL, cudaMemcpyHostToDevice);
Transpose(d_a_f, d_a_n, NERUN, COL);
cudaMemcpy(a_f.data(), d_a_f, sizeof(double) * a_n.size(), cudaMemcpyDeviceToHost);
for (i = 0; i < b_n.size(); i++){
b_f[i] = b_n[i];}
}
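One difference worth double-checking between these two paths (an assumption on my part, not a confirmed cause): decoder copies NERUN * COL doubles into a_n.data(), but a_n is declared in Autoencoder without a size, so that cudaMemcpy writes past the vector's storage and the corruption may only surface on the second iteration. Pre-sizing the vector would rule this out:

vector<double> a_n(NERUN * COL); // hypothetical change: give a_n.data() room for decoder's cudaMemcpy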

Best Regards

Arezoo Moradi