CUDA C++ Segmentation Fault

chester20080 · September 30, 2017, 5:47pm

I want to parallelize my serial neural network code with CUDA, but there is a segmentation fault error (core dumped). My code is:

final.cu

#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"

#define datanum 4 		// number of training samples
#define InputN 16		// number of neurons in the input layer
#define hn 64			// number of neurons in the hidden layer
#define OutN 1			// number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10


using namespace std;


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp);

int main(int argc, char *argv[]){
	int times = 1000;
	double alpha = 0.1, beta = 0.1;

	srand (time(NULL));
	float randData[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		randData[i] = rand()%100;	//Else, without %100, it returns some billions for number!
	}

	int loop = 0;
	double error[blocks*threads_per_block];
	double max[blocks*threads_per_block], min[blocks*threads_per_block];
	for (int i=0; i<blocks*threads_per_block; i++)
	{
		error[i] = 0.0;
		max[i] = 0.0;
		min[i]= 0.0;
	}

	double x_out[blocks*threads_per_block*sizeof(double)*InputN];
	double hn_out[blocks*threads_per_block*sizeof(double)*hn];
	double y_out[blocks*threads_per_block*sizeof(double)*OutN];
	double y[blocks*threads_per_block*sizeof(double)*OutN];
	double hn_delta[blocks*threads_per_block*sizeof(double)*hn];
	double y_delta[blocks*threads_per_block*sizeof(double)*OutN];
	double sumtemp[blocks*threads_per_block*sizeof(double)];
	double errtemp[blocks*threads_per_block*sizeof(double)];
	double w[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double v[blocks*threads_per_block*sizeof(double)*hn*OutN];
	double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];
	double deltav[blocks*threads_per_block*sizeof(double)*hn*OutN];

	double *max_p_GPU, *min_p_GPU, *error_p_GPU;
	float *randData_p_GPU;
	int *times_p_GPU, *loop_p_GPU;
	double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
	double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
	double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;

	
	cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
	cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&times_p_GPU, sizeof(int));
	cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&loop_p_GPU, sizeof(int));
	cudaMemcpy(loop_p_GPU, &loop, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&error_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(error_p_GPU, error, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);	
	cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
	cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
	cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
	cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&beta_p_GPU, sizeof(double));
	cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);

	neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, error_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);

	cudaDeviceSynchronize();
}



// sigmoid serves as avtivation function
__device__ double sigmoid(double x){
	return(1.0 / (1.0 + exp(-x)));
}


__device__ int rand_kernel(int index, float *randData){
	float myrandf = randData[index];
	myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
	myrandf += MIN_RAND;
	int myrand = (int)truncf(myrandf);
	return myrand;
}


__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{	
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	
	struct{
		double input_kernel[InputN];
		double teach_kernel[OutN];
	}data_kernel[blocks*threads_per_block + datanum];
		
	for(int m=0; m<datanum; m++){
		for(int i=0; i<InputN; i++)
			data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
		for(int i=0;i<OutN;i++)
			data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
	}
	
	__syncthreads();
		

	// Initialization
	for(int i=0; i<InputN; i++){
		for(int j=0; j<hn; j++){
			w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltaw(i,j) = 0;
		}
	}
	for(int i=0; i<hn; i++){
		for(int j=0; j<OutN; j++){
			v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltav(i,j) = 0;
		}
	}
	
while(loop[index] < *times){
		loop[index]++;
		error[index] = 0.0;
		for(int m=0; m<datanum ; m++){
			// Feedforward
			max[index] = 0.0;
			min[index] = 0.0;
			for(int i=0; i<InputN; i++){
				x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
				if(max[index] < x_out(index,i))
					max[index] = x_out(index,i);
				if(min[index] > x_out(index,i))
					min[index] = x_out(index,i);
			}
			for(int i=0; i<InputN; i++){
				x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
			}

			for(int i=0; i<OutN ; i++){
				y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
			}

			for(int i=0; i<hn; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<InputN; j++)
					sumtemp[index] += w(j,i) * x_out(index,j);
				hn_out(index,i) = sigmoid(sumtemp[index]);		// sigmoid serves as the activation function
			}

			for(int i=0; i<OutN; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<hn; j++)
					sumtemp[index] += v(j,i) * hn_out(index,j);
				y_out(index,i) = sigmoid(sumtemp[index]);
			}

			// Backpropagation
			for(int i=0; i<OutN; i++){
				errtemp[index] = y(index,i) - y_out(index,i);
				y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
				error[index] += errtemp[index] * errtemp[index];
			}

			for(int i=0; i<hn; i++){
				errtemp[index] = 0.0;
				for(int j=0; j<OutN; j++)
					errtemp[index] += y_delta(index,j) * v(i,j);
				hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
			}

			// Stochastic gradient descent
			for(int i=0; i<OutN; i++){
				for(int j=0; j<hn; j++){
					deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
					v(j,i) -= deltav(j,i);
				}
			}

			for(int i=0; i<hn; i++){
				for(int j=0; j<InputN; j++){
					deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
					w(j,i) -= deltaw(j,i);
				}
			}
		}

		// Global error
		error[index] = error[index] / 2;
		
		//printf("The %d th training, error: %0.10f\n", loop[index], error[index]);
	}
}

Robert_Crovella · September 30, 2017, 6:36pm

when you are allocating an array, you don’t need to include sizeof(type) like you have here:

double x_out[blocks*threads_per_block*sizeof(double)*InputN];
                                      ^^^^^^^^^^^^^^

it should be sufficient to do this:

double x_out[blocks*threads_per_block*InputN];

(and the same for all others like this)

you have too much stack data (this is what is actually causing the seg fault). Either move these definitions outside of main, or convert them to heap-based allocations. For example convert this:

double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];

to this:

double *deltaw = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);

note that here you do need the sizeof(type). And repeat for all other allocations like this in main.

Topic		Replies	Views
CUDA C++ Segmentation Fault CUDA Programming and Performance	7	14635	October 1, 2017
Neural Network (Backpropagation) implementation in CUDA CUDA Programming and Performance	0	1700	October 1, 2017
My kernel functions aren't recognized CUDA Programming and Performance	5	1875	April 16, 2013
Segmentation Fault CUDA Programming and Performance	9	3490	June 14, 2011
Cuda code performance CUDA Programming and Performance	14	3118	December 16, 2014
Cluster siliarity calculated with usage of array reduction optimization CUDA Programming and Performance cuda , kernel	9	339	December 3, 2022
Cuda application crashes works fine for small data and crashes for big data CUDA Developer Tools	0	360	December 8, 2020
Mysteriously changing shared memory variable Race condition? Overwriting an array? CUDA Programming and Performance	12	5913	November 25, 2011
matrix multiply reduction CUDA Programming and Performance	41	35540	January 15, 2011
cuda thrust::sort_by_key system error CUDA Programming and Performance	8	1647	October 24, 2017

CUDA C++ Segmentation Fault

Related topics