I want to parallelize my serial neural network code with CUDA, but I am getting a segmentation fault (core dumped) when I run it. My code is:
final.cu
#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]
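// Indexing macros: each logical 2D array (i,j) is stored flattened in a single 1D buffer.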
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <time.h> // needed for time(NULL) below
#include <string>
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"
#define datanum 4 // number of training samples
#define InputN 16 // number of neurons in the input layer
#define hn 64 // number of neurons in the hidden layer
#define OutN 1 // number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10
using namespace std;
__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp);
int main(int argc, char *argv[]){
int times = 1000;
double alpha = 0.1, beta = 0.1;
srand (time(NULL));
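// Pre-generate one random value per GPU thread on the host, since rand() cannot be called from device code.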
float randData[blocks*threads_per_block];
for (int i=0; i<blocks*threads_per_block; i++)
{
randData[i] = rand()%100; // without the %100, rand() returns values in the billions
}
int loop = 0;
double error[blocks*threads_per_block];
double max[blocks*threads_per_block], min[blocks*threads_per_block];
for (int i=0; i<blocks*threads_per_block; i++)
{
error[i] = 0.0;
max[i] = 0.0;
min[i]= 0.0;
}
double x_out[blocks*threads_per_block*sizeof(double)*InputN];
double hn_out[blocks*threads_per_block*sizeof(double)*hn];
double y_out[blocks*threads_per_block*sizeof(double)*OutN];
double y[blocks*threads_per_block*sizeof(double)*OutN];
double hn_delta[blocks*threads_per_block*sizeof(double)*hn];
double y_delta[blocks*threads_per_block*sizeof(double)*OutN];
double sumtemp[blocks*threads_per_block*sizeof(double)];
double errtemp[blocks*threads_per_block*sizeof(double)];
double w[blocks*threads_per_block*sizeof(double)*InputN*hn];
double v[blocks*threads_per_block*sizeof(double)*hn*OutN];
double deltaw[blocks*threads_per_block*sizeof(double)*InputN*hn];
double deltav[blocks*threads_per_block*sizeof(double)*hn*OutN];
double *max_p_GPU, *min_p_GPU, *error_p_GPU;
float *randData_p_GPU;
int *times_p_GPU, *loop_p_GPU;
double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;
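// Allocate a device mirror for every host array and copy the initial contents to the GPU.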
cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
cudaMalloc((void **)&times_p_GPU, sizeof(int));
cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void **)&loop_p_GPU, sizeof(int));
cudaMemcpy(loop_p_GPU, &loop, sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void **)&error_p_GPU, blocks*threads_per_block*sizeof(double));
cudaMemcpy(error_p_GPU, error, sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&beta_p_GPU, sizeof(double));
cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
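// Launch the training kernel with <<<blocks, threads_per_block>>>.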
neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, error_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);
cudaDeviceSynchronize();
}
// sigmoid serves as the activation function
__device__ double sigmoid(double x){
return(1.0 / (1.0 + exp(-x)));
}
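// Derive an integer from this thread's precomputed host random value (scaled by MAX_RAND - MIN_RAND and truncated).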
__device__ int rand_kernel(int index, float *randData){
float myrandf = randData[index];
myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
myrandf += MIN_RAND;
int myrand = (int)truncf(myrandf);
return myrand;
}
__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *error, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
struct{
double input_kernel[InputN];
double teach_kernel[OutN];
}data_kernel[blocks*threads_per_block + datanum];
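// Fill the thread-local training samples with pseudo-random inputs and targets, divided by 32767 to land in [0, 1].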
for(int m=0; m<datanum; m++){
for(int i=0; i<InputN; i++)
data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
for(int i=0;i<OutN;i++)
data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
}
__syncthreads();
// Initialization
for(int i=0; i<InputN; i++){
for(int j=0; j<hn; j++){
w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
deltaw(i,j) = 0;
}
}
for(int i=0; i<hn; i++){
for(int j=0; j<OutN; j++){
v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
deltav(i,j) = 0;
}
}
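// Main training loop: one epoch per iteration, up to *times epochs.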
while(loop[index] < *times){
loop[index]++;
error[index] = 0.0;
for(int m=0; m<datanum ; m++){
// Feedforward
max[index] = 0.0;
min[index] = 0.0;
for(int i=0; i<InputN; i++){
x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
if(max[index] < x_out(index,i))
max[index] = x_out(index,i);
if(min[index] > x_out(index,i))
min[index] = x_out(index,i);
}
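// Min-max normalize the inputs using the range just found.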
for(int i=0; i<InputN; i++){
x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
}
for(int i=0; i<OutN ; i++){
y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
}
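// Forward pass, input layer -> hidden layer.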
for(int i=0; i<hn; i++){
sumtemp[index] = 0.0;
for(int j=0; j<InputN; j++)
sumtemp[index] += w(j,i) * x_out(index,j);
hn_out(index,i) = sigmoid(sumtemp[index]); // sigmoid serves as the activation function
}
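// Forward pass, hidden layer -> output layer.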
for(int i=0; i<OutN; i++){
sumtemp[index] = 0.0;
for(int j=0; j<hn; j++)
sumtemp[index] += v(j,i) * hn_out(index,j);
y_out(index,i) = sigmoid(sumtemp[index]);
}
// Backpropagation
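// Output-layer deltas and accumulation of the squared error.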
for(int i=0; i<OutN; i++){
errtemp[index] = y(index,i) - y_out(index,i);
y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
error[index] += errtemp[index] * errtemp[index];
}
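// Hidden-layer deltas, backpropagated through the v weights.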
for(int i=0; i<hn; i++){
errtemp[index] = 0.0;
for(int j=0; j<OutN; j++)
errtemp[index] += y_delta(index,j) * v(i,j);
hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
}
// Stochastic gradient descent
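// Momentum-style updates: blend the previous step (alpha) with the new gradient step (beta).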
for(int i=0; i<OutN; i++){
for(int j=0; j<hn; j++){
deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
v(j,i) -= deltav(j,i);
}
}
for(int i=0; i<hn; i++){
for(int j=0; j<InputN; j++){
deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
w(j,i) -= deltaw(j,i);
}
}
}
// Global error
error[index] = error[index] / 2;
//printf("The %d th training, error: %0.10f\n", loop[index], error[index]);
}
}
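One thing I have not done yet is check the return codes of the CUDA calls, so the failure could be anywhere. A minimal checking macro I could wrap each call in might look like this (just a sketch using the standard cudaGetLastError/cudaGetErrorString API, not something already in my code):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Hypothetical helper: abort with file/line info when a CUDA call fails.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Usage sketch:
// CUDA_CHECK(cudaMalloc((void **)&loop_p_GPU, sizeof(int)));
// neural_network_kernel<<<blocks, threads_per_block>>>(...);
// CUDA_CHECK(cudaGetLastError());        // catches launch errors
// CUDA_CHECK(cudaDeviceSynchronize());   // catches asynchronous kernel errors
That would at least tell me whether the crash happens in one of the CUDA calls or on the host before the GPU is ever touched.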
Any help, suggestions, or ideas would be extremely helpful. Thank you in advance!