CUDA C++ Segmentation Fault

I want to parallelize my serial neural network code with CUDA, but there is a segmentation fault error (core dumped). My code is:

final.cu

#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"

#define datanum 4 		// number of training samples
#define InputN 16		// number of neurons in the input layer
#define hn 64			// number of neurons in the hidden layer
#define OutN 1			// number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10


using namespace std;


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp);

int main(int argc, char *argv[]){
	int times = 1000;
	double alpha = 0.1, beta = 0.1;

	srand (time(NULL));
	float randData[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		randData[i] = rand()%100;	//Else, without %100, it returns some billions for number!
	}

	int loop = 0;
	double error[blocks*threads_per_block];
	double max[blocks*threads_per_block], min[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		error[i] = 0.0;
		max[i] = 0.0;
		min[i]= 0.0;
	}

	double x_out[blocks*threads_per_block*sizeof(double)*InputN];
	double hn_out[blocks*threads_per_block*sizeof(double)*hn];
	double y_out[blocks*threads_per_block*sizeof(double)*OutN];
	double y[blocks*threads_per_block*sizeof(double)*OutN];
	double hn_delta[blocks*threads_per_block*sizeof(double)*hn];
	double y_delta[blocks*threads_per_block*sizeof(double)*OutN];
	double sumtemp[blocks*threads_per_block*sizeof(double)];
	double errtemp[blocks*threads_per_block*sizeof(double)];
	double w[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double v[blocks*threads_per_block*sizeof(double)*hn*OutN];
	double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double deltav[blocks*threads_per_block*sizeof(double)*hn*OutN];

	double *max_p_GPU, *min_p_GPU, *error_p_GPU;
	float *randData_p_GPU;
	int *times_p_GPU, *loop_p_GPU;
	double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
	double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
	double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;

	
	cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
	cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&times_p_GPU, sizeof(int));
	cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&loop_p_GPU, sizeof(int));
	cudaMemcpy(loop_p_GPU, &loop, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&error_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(error_p_GPU, error, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);	
	cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
	cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
	cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
	cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&beta_p_GPU, sizeof(double));
	cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);

	neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, error_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);

	cudaDeviceSynchronize();
}



// sigmoid serves as avtivation function
__device__ double sigmoid(double x){
	return(1.0 / (1.0 + exp(-x)));
}


__device__ int rand_kernel(int index, float *randData){
	float myrandf = randData[index];
	myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
	myrandf += MIN_RAND;
	int myrand = (int)truncf(myrandf);
	return myrand;
}


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{	
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	
	struct{
		double input_kernel[InputN];
		double teach_kernel[OutN];
	}data_kernel[blocks*threads_per_block + datanum];
		
	for(int m=0; m<datanum; m++){
		for(int i=0; i<InputN; i++)
			data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
		for(int i=0;i<OutN;i++)
			data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
	}
	
	__syncthreads();
		

	// Initialization
	for(int i=0; i<InputN; i++){
		for(int j=0; j<hn; j++){
			w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltaw(i,j) = 0;
		}
	}
	for(int i=0; i<hn; i++){
		for(int j=0; j<OutN; j++){
			v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltav(i,j) = 0;
		}
	}
	
while(loop[index] < *times){
		loop[index]++;
		error[index] = 0.0;
		for(int m=0; m<datanum ; m++){
			// Feedforward
			max[index] = 0.0;
			min[index] = 0.0;
			for(int i=0; i<InputN; i++){
				x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
				if(max[index] < x_out(index,i))
					max[index] = x_out(index,i);
				if(min[index] > x_out(index,i))
					min[index] = x_out(index,i);
			}
			for(int i=0; i<InputN; i++){
				x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
			}

			for(int i=0; i<OutN ; i++){
				y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
			}

			for(int i=0; i<hn; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<InputN; j++)
					sumtemp[index] += w(j,i) * x_out(index,j);
				hn_out(index,i) = sigmoid(sumtemp[index]);		// sigmoid serves as the activation function
			}

			for(int i=0; i<OutN; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<hn; j++)
					sumtemp[index] += v(j,i) * hn_out(index,j);
				y_out(index,i) = sigmoid(sumtemp[index]);
			}

			// Backpropagation
			for(int i=0; i<OutN; i++){
				errtemp[index] = y(index,i) - y_out(index,i);
				y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
				error[index] += errtemp[index] * errtemp[index];
			}

			for(int i=0; i<hn; i++){
				errtemp[index] = 0.0;
				for(int j=0; j<OutN; j++)
					errtemp[index] += y_delta(index,j) * v(i,j);
				hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
			}

			// Stochastic gradient descent
			for(int i=0; i<OutN; i++){
				for(int j=0; j<hn; j++){
					deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
					v(j,i) -= deltav(j,i);
				}
			}

			for(int i=0; i<hn; i++){
				for(int j=0; j<InputN; j++){
					deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
					w(j,i) -= deltaw(j,i);
				}
			}
		}

		// Global error
		error[index] = error[index] / 2;
		
		//printf("The %d th training, error: %0.10f\n", loop[index], error[index]);
	}
}
  1. when you are allocating an array, you don’t need to include sizeof(type) like you have here:
double x_out[blocks*threads_per_block*sizeof(double)*InputN];
                                      ^^^^^^^^^^^^^^

it should be sufficient to do this:

double x_out[blocks*threads_per_block*InputN];

(and the same for all others like this)

  1. you have too much stack data (this is what is actually causing the seg fault). Either move these definitions outside of main, or convert them to heap-based allocations. For example convert this:
double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];

to this:

double *deltaw = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);

note that here you do need the sizeof(type). And repeat for all other allocations like this in main.