CUDA C++ Segmentation Fault

I want to parallelize my serial neural network code with CUDA, but I am getting a segmentation fault (core dumped). My code is:

final.cu

#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include <time.h>	// for time(), used to seed srand()
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"

#define datanum 4 		// number of training samples
#define InputN 16		// number of neurons in the input layer
#define hn 64			// number of neurons in the hidden layer
#define OutN 1			// number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10


using namespace std;


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp);

int main(int argc, char *argv[]){
	int times = 1000;
	double alpha = 0.1, beta = 0.1;

	srand (time(NULL));
	float randData[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		randData[i] = rand()%100;	// Without the %100, rand() returns numbers in the billions!
	}

	int loop = 0;
	double error[blocks*threads_per_block];
	double max[blocks*threads_per_block], min[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		error[i] = 0.0;
		max[i] = 0.0;
		min[i]= 0.0;
	}

	double x_out[blocks*threads_per_block*sizeof(double)*InputN];
	double hn_out[blocks*threads_per_block*sizeof(double)*hn];
	double y_out[blocks*threads_per_block*sizeof(double)*OutN];
	double y[blocks*threads_per_block*sizeof(double)*OutN];
	double hn_delta[blocks*threads_per_block*sizeof(double)*hn];
	double y_delta[blocks*threads_per_block*sizeof(double)*OutN];
	double sumtemp[blocks*threads_per_block*sizeof(double)];
	double errtemp[blocks*threads_per_block*sizeof(double)];
	double w[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double v[blocks*threads_per_block*sizeof(double)*hn*OutN];
	double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double deltav[blocks*threads_per_block*sizeof(double)*hn*OutN];

	double *max_p_GPU, *min_p_GPU, *error_p_GPU;
	float *randData_p_GPU;
	int *times_p_GPU, *loop_p_GPU;
	double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
	double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
	double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;

	
	cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
	cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&times_p_GPU, sizeof(int));
	cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&loop_p_GPU, sizeof(int));
	cudaMemcpy(loop_p_GPU, &loop, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&error_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(error_p_GPU, error, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);	
	cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
	cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
	cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
	cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&beta_p_GPU, sizeof(double));
	cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);

	neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, error_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);

	cudaDeviceSynchronize();
}



// sigmoid serves as the activation function
__device__ double sigmoid(double x){
	return(1.0 / (1.0 + exp(-x)));
}


__device__ int rand_kernel(int index, float *randData){
	float myrandf = randData[index];
	myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
	myrandf += MIN_RAND;
	int myrand = (int)truncf(myrandf);
	return myrand;
}


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{	
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	
	struct{
		double input_kernel[InputN];
		double teach_kernel[OutN];
	}data_kernel[blocks*threads_per_block + datanum];
		
	for(int m=0; m<datanum; m++){
		for(int i=0; i<InputN; i++)
			data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
		for(int i=0;i<OutN;i++)
			data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
	}
	
	__syncthreads();
		

	// Initialization
	for(int i=0; i<InputN; i++){
		for(int j=0; j<hn; j++){
			w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltaw(i,j) = 0;
		}
	}
	for(int i=0; i<hn; i++){
		for(int j=0; j<OutN; j++){
			v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltav(i,j) = 0;
		}
	}
	
while(loop[index] < *times){
		loop[index]++;
		error[index] = 0.0;
		for(int m=0; m<datanum ; m++){
			// Feedforward
			max[index] = 0.0;
			min[index] = 0.0;
			for(int i=0; i<InputN; i++){
				x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
				if(max[index] < x_out(index,i))
					max[index] = x_out(index,i);
				if(min[index] > x_out(index,i))
					min[index] = x_out(index,i);
			}
			for(int i=0; i<InputN; i++){
				x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
			}

			for(int i=0; i<OutN ; i++){
				y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
			}

			for(int i=0; i<hn; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<InputN; j++)
					sumtemp[index] += w(j,i) * x_out(index,j);
				hn_out(index,i) = sigmoid(sumtemp[index]);		// sigmoid serves as the activation function
			}

			for(int i=0; i<OutN; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<hn; j++)
					sumtemp[index] += v(j,i) * hn_out(index,j);
				y_out(index,i) = sigmoid(sumtemp[index]);
			}

			// Backpropagation
			for(int i=0; i<OutN; i++){
				errtemp[index] = y(index,i) - y_out(index,i);
				y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
				error[index] += errtemp[index] * errtemp[index];
			}

			for(int i=0; i<hn; i++){
				errtemp[index] = 0.0;
				for(int j=0; j<OutN; j++)
					errtemp[index] += y_delta(index,j) * v(i,j);
				hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
			}

			// Stochastic gradient descent
			for(int i=0; i<OutN; i++){
				for(int j=0; j<hn; j++){
					deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
					v(j,i) -= deltav(j,i);
				}
			}

			for(int i=0; i<hn; i++){
				for(int j=0; j<InputN; j++){
					deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
					w(j,i) -= deltaw(j,i);
				}
			}
		}

		// Global error
		error[index] = error[index] / 2;
		
		//printf("The %d th training, error: %0.10f\n", loop[index], error[index]);
	}
}

Any help/suggestions/ideas would be extremely helpful, thank you in advance!

Segmentation faults occur when you have out-of-bounds memory accesses in the host code. So you would proceed to locate the error as you normally would with code that does not involve CUDA. In other words, use standard debugging techniques.

You could try the valgrind utility, or instrument the array accesses in the code to manually check indices / addresses against the relevant allocation bound.
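For example, a minimal sketch of that kind of instrumentation (the CHECKED_IDX macro and its usage shown here are hypothetical, not part of your code) could look like this:

// Hypothetical bounds-checking helper: aborts with a message when an
// index falls outside [0, bound). Needs <stdio.h> and <stdlib.h>.
#define CHECKED_IDX(idx, bound) \
	(((idx) >= 0 && (idx) < (bound)) ? (idx) : \
	 (fprintf(stderr, "index %d out of bounds %d at %s:%d\n", \
	          (int)(idx), (int)(bound), __FILE__, __LINE__), abort(), 0))

// Example use on a suspect host-side access:
// randData[CHECKED_IDX(i, blocks*threads_per_block)] = rand() % 100;

Once the program aborts at a specific file and line, you know exactly which access ran past its allocation.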

Thank you for answering!
Actually, I have it print something right at the start of main() and it does not even get that far; it segfaults immediately.
The out-of-bounds memory accesses in host code that you mention, do you mean in normal mallocs? Or can they happen even in cudaMalloc and cudaMemcpy?
Because on the host side, I declare and initialize (1D) arrays of predefined size.

I tried cuda-memcheck and I get the following:

cuda-memcheck final
========= CUDA-MEMCHECK
========= Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under cuda-gdb or Nsight Eclipse Edition to catch host side errors.
========= Internal error (20)
========= No CUDA-MEMCHECK results found

Actually, now after some modifications, the cuda-memcheck gives:

*** Error in `final': free(): invalid next size (normal): 0x00000000010e6d40 ***
======= Backtrace: =========
/lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7f6b1ae177e5]
/lib/x86_64-linux-gnu/libc.so.6(+0x8037a)[0x7f6b1ae2037a]
/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f6b1ae2453c]
/lib/x86_64-linux-gnu/libc.so.6(_IO_setb+0x4b)[0x7f6b1ae1b54b]
/lib/x86_64-linux-gnu/libc.so.6(_IO_file_close_it+0xae)[0x7f6b1ae198ee]
/lib/x86_64-linux-gnu/libc.so.6(fclose+0x18f)[0x7f6b1ae0d3ef]
/usr/lib/x86_64-linux-gnu/libcuda.so.1(+0x1bd299)[0x7f6b1a26b299]
/usr/lib/x86_64-linux-gnu/libcuda.so.1(+0x195b93)[0x7f6b1a243b93]
/usr/lib/x86_64-linux-gnu/libcuda.so.1(+0xbdb68)[0x7f6b1a16bb68]
/usr/lib/x86_64-linux-gnu/libcuda.so.1(cuInit+0x4d)[0x7f6b1a29193d]
final[0x41d854]
final[0x41ea63]
/lib/x86_64-linux-gnu/libpthread.so.0(+0xea99)[0x7f6b1b914a99]
final[0x452369]
final[0x41f08e]
final[0x441f02]
final[0x40314a]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7f6b1adc0830]
final[0x402c19]
======= Memory map: ========
00400000-00470000 r-xp 00000000 08:01 19534391                           /home/inglezos/ergasia_4_cuda/HERE_sept28/final
0066f000-00672000 r--p 0006f000 08:01 19534391                           /home/inglezos/ergasia_4_cuda/HERE_sept28/final
00672000-00673000 rw-p 00072000 08:01 19534391                           /home/inglezos/ergasia_4_cuda/HERE_sept28/final
00673000-00674000 rw-p 00000000 00:00 0 
010c1000-010f3000 rw-p 00000000 00:00 0                                  [heap]
7f6b14000000-7f6b14021000 rw-p 00000000 00:00 0 
7f6b14021000-7f6b18000000 ---p 00000000 00:00 0 
7f6b19e62000-7f6b19ea4000 r-xp 00000000 08:01 3285377                    /usr/lib/nvidia-375/libnvidia-fatbinaryloader.so.375.66
7f6b19ea4000-7f6b1a0a3000 ---p 00042000 08:01 3285377                    /usr/lib/nvidia-375/libnvidia-fatbinaryloader.so.375.66
7f6b1a0a3000-7f6b1a0ad000 rw-p 00041000 08:01 3285377                    /usr/lib/nvidia-375/libnvidia-fatbinaryloader.so.375.66
7f6b1a0ad000-7f6b1a0ae000 rw-p 00000000 00:00 0 
7f6b1a0ae000-7f6b1a770000 r-xp 00000000 08:01 787617                     /usr/lib/x86_64-linux-gnu/libcuda.so.375.66
7f6b1a770000-7f6b1a96f000 ---p 006c2000 08:01 787617                     /usr/lib/x86_64-linux-gnu/libcuda.so.375.66
7f6b1a96f000-7f6b1aa8a000 rw-p 006c1000 08:01 787617                     /usr/lib/x86_64-linux-gnu/libcuda.so.375.66
7f6b1aa8a000-7f6b1aa97000 rw-p 00000000 00:00 0 
7f6b1aa97000-7f6b1ab9f000 r-xp 00000000 08:01 26744286                   /lib/x86_64-linux-gnu/libm-2.23.so
7f6b1ab9f000-7f6b1ad9e000 ---p 00108000 08:01 26744286                   /lib/x86_64-linux-gnu/libm-2.23.so
7f6b1ad9e000-7f6b1ad9f000 r--p 00107000 08:01 26744286                   /lib/x86_64-linux-gnu/libm-2.23.so
7f6b1ad9f000-7f6b1ada0000 rw-p 00108000 08:01 26744286                   /lib/x86_64-linux-gnu/libm-2.23.so
7f6b1ada0000-7f6b1af60000 r-xp 00000000 08:01 26744291                   /lib/x86_64-linux-gnu/libc-2.23.so
7f6b1af60000-7f6b1b160000 ---p 001c0000 08:01 26744291                   /lib/x86_64-linux-gnu/libc-2.23.so
7f6b1b160000-7f6b1b164000 r--p 001c0000 08:01 26744291                   /lib/x86_64-linux-gnu/libc-2.23.so
7f6b1b164000-7f6b1b166000 rw-p 001c4000 08:01 26744291                   /lib/x86_64-linux-gnu/libc-2.23.so
7f6b1b166000-7f6b1b16a000 rw-p 00000000 00:00 0 
7f6b1b16a000-7f6b1b180000 r-xp 00000000 08:01 26739223                   /lib/x86_64-linux-gnu/libgcc_s.so.1
7f6b1b180000-7f6b1b37f000 ---p 00016000 08:01 26739223                   /lib/x86_64-linux-gnu/libgcc_s.so.1
7f6b1b37f000-7f6b1b380000 rw-p 00015000 08:01 26739223                   /lib/x86_64-linux-gnu/libgcc_s.so.1
7f6b1b380000-7f6b1b4f2000 r-xp 00000000 08:01 786552                     /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.21
7f6b1b4f2000-7f6b1b6f2000 ---p 00172000 08:01 786552                     /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.21
7f6b1b6f2000-7f6b1b6fc000 r--p 00172000 08:01 786552                     /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.21
7f6b1b6fc000-7f6b1b6fe000 rw-p 0017c000 08:01 786552                     /usr/lib/x86_64-linux-gnu/libstdc++.so.6.0.21
7f6b1b6fe000-7f6b1b702000 rw-p 00000000 00:00 0 
7f6b1b702000-7f6b1b705000 r-xp 00000000 08:01 26744280                   /lib/x86_64-linux-gnu/libdl-2.23.so
7f6b1b705000-7f6b1b904000 ---p 00003000 08:01 26744280                   /lib/x86_64-linux-gnu/libdl-2.23.so
7f6b1b904000-7f6b1b905000 r--p 00002000 08:01 26744280                   /lib/x86_64-linux-gnu/libdl-2.23.so
7f6b1b905000-7f6b1b906000 rw-p 00003000 08:01 26744280                   /lib/x86_64-linux-gnu/libdl-2.23.so
7f6b1b906000-7f6b1b91e000 r-xp 00000000 08:01 26744274                   /lib/x86_64-linux-gnu/libpthread-2.23.so
7f6b1b91e000-7f6b1bb1d000 ---p 00018000 08:01 26744274                   /lib/x86_64-linux-gnu/libpthread-2.23.so
7f6b1bb1d000-7f6b1bb1e000 r--p 00017000 08:01 26744274                   /lib/x86_64-linux-gnu/libpthread-2.23.so
7f6b1bb1e000-7f6b1bb1f000 rw-p 00018000 08:01 26744274                   /lib/x86_64-linux-gnu/libpthread-2.23.so
7f6b1bb1f000-7f6b1bb23000 rw-p 00000000 00:00 0 
7f6b1bb23000-7f6b1bb2a000 r-xp 00000000 08:01 26744277                   /lib/x86_64-linux-gnu/librt-2.23.so
7f6b1bb2a000-7f6b1bd29000 ---p 00007000 08:01 26744277                   /lib/x86_64-linux-gnu/librt-2.23.so
7f6b1bd29000-7f6b1bd2a000 r--p 00006000 08:01 26744277                   /lib/x86_64-linux-gnu/librt-2.23.so
7f6b1bd2a000-7f6b1bd2b000 rw-p 00007000 08:01 26744277                   /lib/x86_64-linux-gnu/librt-2.23.so
7f6b1bd2b000-7f6b1bd51000 r-xp 00000000 08:01 26743592                   /lib/x86_64-linux-gnu/ld-2.23.so
7f6b1bf32000-7f6b1bf39000 rw-p 00000000 00:00 0 
7f6b1bf4d000-7f6b1bf50000 rw-p 00000000 00:00 0 
7f6b1bf50000-7f6b1bf51000 r--p 00025000 08:01 26743592                   /lib/x86_64-linux-gnu/ld-2.23.so
7f6b1bf51000-7f6b1bf52000 rw-p 00026000 08:01 26743592                   /lib/x86_64-linux-gnu/ld-2.23.so
7f6b1bf52000-7f6b1bf53000 rw-p 00000000 00:00 0 
7ffce57ee000-7ffce580f000 rw-p 00000000 00:00 0                          [stack]
7ffce594f000-7ffce5951000 r--p 00000000 00:00 0                          [vvar]
7ffce5951000-7ffce5953000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found

cuda-memcheck can find out-of-bounds accesses in your GPU code (device code). Your problem is in CPU code (host code). You have out-of-bounds write accesses somewhere that overwrite the control blocks of the memory allocator (presumably the sub-allocator in the C runtime library). When called, free() finds that the control block for a particular allocation has been corrupted.

Try valgrind; it can help you find out-of-bounds accesses and point out where in your code they occur. Or put in some elbow grease to track down the offending access(es) yourself.
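If valgrind is available, a typical invocation (assuming the executable is called final, as in the cuda-memcheck run above) would be:

valgrind --tool=memcheck --track-origins=yes ./final

Note that the CUDA driver itself tends to trigger a number of benign valgrind complaints, so concentrate on the invalid reads and writes whose backtraces point into your own code.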

Debugging is an important part of the software engineering skill set, and that skill is honed, first and foremost, by practice. My suggestion is to start practicing now using this example code. The essence of debugging is extracting information that helps you narrow down when and where the problem occurs. Just put your mind to it, and you’ll be able to figure it out. After all, people learned how to debug software long before the capability of asking random people on the internet for help existed.

Because I am working on a university server, I can’t install valgrind unfortunately; I am not an admin.
And I don’t know how to get anything further out of the cuda-memcheck log info.
I changed all the matrix variables to pointers and allocated the memory with malloc in the host code.
And for some big sizes it wouldn’t let me continue, so I used new there for the allocation…
So far I have found that the segfault occurs at the first cudaMalloc, for some mysterious reason I don’t understand.
Does it run out of memory?
The updated code is:

#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include <time.h>	// for time(), used to seed srand()
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"

#define datanum 4 		// number of training samples
#define InputN 16		// number of neurons in the input layer
#define hn 64			// number of neurons in the hidden layer
#define OutN 1			// number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10

using namespace std;

// sigmoid serves as the activation function
__device__ double sigmoid(double x){
	return(1.0 / (1.0 + exp(-x)));
}

__device__ int rand_kernel(int index, float *randData){
	float myrandf = randData[index];
	myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
	myrandf += MIN_RAND;
	int myrand = (int)truncf(myrandf);
	return myrand;
}

__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{	
	int index = blockIdx.x * blockDim.x + threadIdx.x;
printf("\nHERE1!\n");
		
	struct{
		double input_kernel[InputN];
		double teach_kernel[OutN];
	}data_kernel[blocks*threads_per_block + datanum];
		
	for(int m=0; m<datanum; m++){
		for(int i=0; i<InputN; i++)
			data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
		for(int i=0;i<OutN;i++)
			data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
	}
	
	__syncthreads();
		

	// Initialization
	for(int i=0; i<InputN; i++){
		for(int j=0; j<hn; j++){
			w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltaw(i,j) = 0;
		}
	}
	for(int i=0; i<hn; i++){
		for(int j=0; j<OutN; j++){
			v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltav(i,j) = 0;
		}
	}
	
printf("\nHERE3!\n");

while(loop[index] < *times){
		loop[index]++;
		error[index] = 0.0;
printf("\nHERE4!\n");	
		for(int m=0; m<datanum ; m++){
			// Feedforward
			max[index] = 0.0;
			min[index] = 0.0;
			for(int i=0; i<InputN; i++){
				x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
				if(max[index] < x_out(index,i))
					max[index] = x_out(index,i);
				if(min[index] > x_out(index,i))
					min[index] = x_out(index,i);
			}
			for(int i=0; i<InputN; i++){
				x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
			}

			for(int i=0; i<OutN ; i++){
				y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
			}

			for(int i=0; i<hn; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<InputN; j++)
					sumtemp[index] += w(j,i) * x_out(index,j);
				hn_out(index,i) = sigmoid(sumtemp[index]);		// sigmoid serves as the activation function
			}

			for(int i=0; i<OutN; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<hn; j++)
					sumtemp[index] += v(j,i) * hn_out(index,j);
				y_out(index,i) = sigmoid(sumtemp[index]);
			}

			// Backpropagation
			for(int i=0; i<OutN; i++){
				errtemp[index] = y(index,i) - y_out(index,i);
				y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
				error[index] += errtemp[index] * errtemp[index];
			}

			for(int i=0; i<hn; i++){
				errtemp[index] = 0.0;
				for(int j=0; j<OutN; j++)
					errtemp[index] += y_delta(index,j) * v(i,j);
				hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
			}

			// Stochastic gradient descent
			for(int i=0; i<OutN; i++){
				for(int j=0; j<hn; j++){
					deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
					v(j,i) -= deltav(j,i);
				}
			}

			for(int i=0; i<hn; i++){
				for(int j=0; j<InputN; j++){
					deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
					w(j,i) -= deltaw(j,i);
				}
			}
		}

		// Global error
		error[index] = error[index] / 2;
		
printf("\nEND WHILE!\n");
		//printf("The %d th training, error: %0.10f\n", loop[index], error[index]);
	}
}

int main(int argc, char *argv[]){
	printf("welcome1\n");	
	int times = 1000;
	
printf("welcome2\n");
printf("welcome3\n");
printf("welcome4\n");
printf("welcome5\n");
printf("welcome6\n");
printf("welcome7\n");
	
	double alpha = 0.1, beta = 0.1;

	printf("Line : main\n");

	srand (time(NULL));
	float *randData;
	randData = (float *)malloc(blocks*threads_per_block);
printf("welcome8\n");
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		randData[i] = rand()%100;	// Without the %100, rand() returns numbers in the billions!
	}
printf("welcome9\n");
	

	int loop = 0;
	double *error;
	printf("new1\n");
	error = (double *)malloc(blocks*threads_per_block);
		printf("new2\n");

	double *max, *min;
	max = (double *)malloc(blocks*threads_per_block);
		printf("new3\n");

	min = (double *)malloc(blocks*threads_per_block);
			printf("new4\n");

	/*for (int i=0; i<blocks*threads_per_block; i++)
	{
		error[i] = 0.0;
		max[i] = 0.0;
		min[i]= 0.0;
	}*/

	double *x_out;
printf("new5\n");

	x_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN);
	printf("new6\n");

	double *hn_out;
	hn_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);
	printf("new7\n");

	double *y_out;
	y_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
	printf("new8\n");

	double *y;
	y = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
printf("new9\n");

	double *hn_delta;
	hn_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);
	printf("new10\n");

	double *y_delta;
	y_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
	printf("new11\n");

	double *sumtemp;
	sumtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));
	printf("new12\n");

	double *errtemp;
	errtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));
	printf("new13\n");

	double *w = new double(blocks*threads_per_block*sizeof(double)*InputN*hn);	// used new because the allocation size is too big!
	printf("new14\n");

	double *v = new double(blocks*threads_per_block*sizeof(double)*hn*OutN);
	printf("new15\n");

	double *deltaw = new double(blocks*threads_per_block*sizeof(double)*InputN*hn);
	printf("new16\n");

	double *deltav = new double(blocks*threads_per_block*sizeof(double)*hn*OutN);

printf("new17\n");

	double *max_p_GPU, *min_p_GPU, *error_p_GPU;
	float *randData_p_GPU;
	int *times_p_GPU, *loop_p_GPU;
	double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
	double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
	double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;

	//int blocks = times/threads_per_block;

printf("welcome10\n");	
	cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
printf("DEBUG1\n");
	cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
printf("welcome11\n");
	cudaMalloc((void **)&times_p_GPU, sizeof(int));
printf("welcome12\n");
	cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
printf("welcome13\n");
	cudaMalloc((void **)&loop_p_GPU, sizeof(int));
printf("welcome14\n");
	cudaMemcpy(loop_p_GPU, &loop, sizeof(int), cudaMemcpyHostToDevice);
printf("welcome15\n");
	cudaMalloc((void **)&error_p_GPU, blocks*threads_per_block*sizeof(double));
printf("welcome16\n");
	cudaMemcpy(error_p_GPU, error, sizeof(double), cudaMemcpyHostToDevice);
printf("welcome17\n");
	cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
printf("welcome18\n");
	cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
printf("welcome19\n");
	cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
printf("welcome20\n");
	cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
printf("welcome21\n");
	
	cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
printf("welcome22\n");
	cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
printf("welcome23\n");
	cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
printf("welcome24\n");
	cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
printf("welcome25\n");
	cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
printf("welcome26\n");
	cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
printf("welcome27\n");
	cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
	cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
printf("welcome28\n");
	cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
printf("welcome29\n");
	cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
printf("welcome30\n");
	cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
printf("welcome31\n");
	
	cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
	cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&beta_p_GPU, sizeof(double));
	cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	
printf("welcome40\n");

	neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, error_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);

	delete[] v;
	delete[] w;
	delete[] deltav;
	delete[] deltaw;
printf("welcome41\n");
	
	cudaDeviceSynchronize();
printf("welcome_after_kernel\n");
}
  1. change the new operator statements back to malloc operations (switching back isn’t absolutely necessary, but as written the allocation sizes in your new statements are wrong)
  2. go through each malloc operation and make sure you have scaled the allocation size correctly by the size of the type being allocated. You have about four of these that are done incorrectly; here is the first one:
float *randData;
	randData = (float *)malloc(blocks*threads_per_block);
printf("welcome8\n");
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		randData[i] = rand()%100;	// Without the %100, rand() returns numbers in the billions!
	}

change the malloc statement to:

randData = (float *)malloc(blocks*threads_per_block*sizeof(float));
                                                    ^^^^^^^^^^^^^

and do this appropriately for each instance
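Applied to the remaining broken allocations, the corrections might look like the following sketch (note that new double(n) allocates a single double initialized to the value n, while new double[n] allocates an array of n doubles, which is why simply switching back to malloc is the easiest fix):

error = (double *)malloc(blocks*threads_per_block*sizeof(double));
max   = (double *)malloc(blocks*threads_per_block*sizeof(double));
min   = (double *)malloc(blocks*threads_per_block*sizeof(double));

// new double(n) creates ONE double, not n of them; allocate arrays instead:
double *w      = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);
double *v      = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);
double *deltaw = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);
double *deltav = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);

If you make that change, the delete[] calls after the kernel launch must become free() calls as well.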

Thank you very much, the segfault doesn’t occur anymore!