Neural Network (Backpropagation) implementation in CUDA

I tried to parallelize my (serial) neural network code, using CUDA. The output should be the error of the neural network and the runtime. I was based on the thought that I pass each argument as a pointer from main and correspond each thread with a data block inside these arrays. But the error for almost every thread isn’t changed from 0 to something else and the for loop, which has to be executed i.e. 1000 times , runs only for a few times. Also, the nonzero error are decades of digits long as number, not small i.e. 0.something.
All these result in showing, incorrectly, constantly zero output error.
I suspect that the correspondence threads-arrays_values is not that right or some wrong thread writes to a wrong variable or something… I know that this is too much, but if someone has the slightest suggestion/idea/solution that would help me to run the code properly and with the right output info, I would be very very very grateful, I am trying this for days and I am lost in logical debugging but I can’t seem to find any solution. I am sincerely desperate. Thank you very much in advance!

The code is (

#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <sys/time.h>
#include <ctime>
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"

#define datanum 4 		// number of training samples
#define InputN 2		// number of neurons in the input layer
#define hn 4			// number of neurons in the hidden layer
#define OutN 1			// number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10

using namespace std;

__device__ double d_error[blocks*threads_per_block];

// sigmoid serves as avtivation function
__device__ double sigmoid(double x){
	return(1.0 / (1.0 + exp(-x)));

__device__ int rand_kernel(int index, float *randData){
        float myrandf = randData[index];
        myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
        myrandf += MIN_RAND;
        int myrand = (int)truncf(myrandf);
        return myrand;

__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
	int index = blockIdx.x * blockDim.x + threadIdx.x;

		double input_kernel[InputN];
		double teach_kernel[OutN];
	}data_kernel[blocks*threads_per_block + datanum];
	for(int m=0; m<datanum; m++){
		for(int i=0; i<InputN; i++)
			data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
		for(int i=0;i<OutN;i++)
			data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
	// Initialization
	for(int i=0; i<InputN; i++){
		for(int j=0; j<hn; j++){
			w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltaw(i,j) = 0;
	for(int i=0; i<hn; i++){
		for(int j=0; j<OutN; j++){
			v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
			deltav(i,j) = 0;

for(loop[index]=0; loop[index] < *times; loop[index]++){

		d_error[index] = 0.0;
		for(int m=0; m<datanum ; m++){
			// Feedforward
			max[index] = 0.0;
			min[index] = 0.0;
			for(int i=0; i<InputN; i++){
				x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
				if(max[index] < x_out(index,i))
					max[index] = x_out(index,i);
				if(min[index] > x_out(index,i))
					min[index] = x_out(index,i);

			for(int i=0; i<InputN; i++){
				x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);

			for(int i=0; i<OutN ; i++){
				y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];

			for(int i=0; i<hn; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<InputN; j++)
					sumtemp[index] += w(j,i) * x_out(index,j);
				hn_out(index,i) = sigmoid(sumtemp[index]);		// sigmoid serves as the activation function

			for(int i=0; i<OutN; i++){
				sumtemp[index] = 0.0;
				for(int j=0; j<hn; j++)
					sumtemp[index] += v(j,i) * hn_out(index,j);
				y_out(index,i) = sigmoid(sumtemp[index]);

			// Backpropagation
			for(int i=0; i<OutN; i++){
				errtemp[index] = y(index,i) - y_out(index,i);
				y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
				d_error[index] += errtemp[index] * errtemp[index];
			printf("partial : %0.10f\n", d_error[index]);
			for(int i=0; i<hn; i++){
				errtemp[index] = 0.0;
				for(int j=0; j<OutN; j++)
					errtemp[index] += y_delta(index,j) * v(i,j);
				hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));

			// Stochastic gradient descent
			for(int i=0; i<OutN; i++){
				for(int j=0; j<hn; j++){
					deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
					v(j,i) -= deltav(j,i);

			for(int i=0; i<hn; i++){
				for(int j=0; j<InputN; j++){
					deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
					w(j,i) -= deltaw(j,i);
		// Global error
		d_error[index] = d_error[index] / 2.0;
		printf("\n\n\nThe %d th training, error: %0.10f\n\n\n", loop[index], d_error[index]);
		d_error[threads_per_block*blocks-1] = d_error[index];
		if (index==0)
			printf("Total Error :  %0.10f\n", d_error[threads_per_block*blocks-1]);

int main(int argc, char *argv[]){

int times = 1000;
	double alpha = 0.1, beta = 0.1;

	clock_t begin = clock();
	srand (time(NULL));
	float *randData;
	randData = (float *)malloc(blocks*threads_per_block*sizeof(float));
	for (int i=0; i<blocks*threads_per_block; i++)
		randData[i] = rand()%100;	//Else, without %100, it returns some billions for number!

	int *loop;
	loop = (int *)malloc(blocks*threads_per_block*sizeof(int));

	double *max, *min;
	max = (double *)malloc(blocks*threads_per_block*sizeof(double));

	min = (double *)malloc(blocks*threads_per_block*sizeof(double));

	/*for (int i=0; i<blocks*threads_per_block; i++)
		loop[i] = 0;
		error[i] = 0.0;
		max[i] = 0.0;
		min[i]= 0.0;

	double *x_out;
	x_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN);
	double *hn_out;
	hn_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);
	double *y_out;
	y_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
	double *y;
	y = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);

	double *hn_delta;
	hn_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);

	double *y_delta;
	y_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);

	double *sumtemp;
	sumtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));

	double *errtemp;
	errtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));

	double *w;
	w = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);	//use of new, because size is too long!

	double *v;
	v = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);

	double *deltaw;
	deltaw = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);

	double *deltav;
	deltav = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);

	double *max_p_GPU, *min_p_GPU;
	float *randData_p_GPU;
	int *times_p_GPU, *loop_p_GPU;
	double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
	double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
	double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;

	cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
	cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&times_p_GPU, sizeof(int));
	cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&loop_p_GPU, blocks*threads_per_block*sizeof(int));
	cudaMemcpy(loop_p_GPU, loop, blocks*threads_per_block*sizeof(int), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);

	cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
	cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);

	cudaMalloc((void **)&y_out_p_GPU, sizeof(double)*(threads_per_block*OutN));
	cudaMemcpy(y_out_p_GPU, y_out, sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
	cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
	cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
	cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
	cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&beta_p_GPU, sizeof(double));
	cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
	cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
	cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
	cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
	cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);

	neural_network_kernel<<<blocks, threads_per_block>>>(randData, times_p_GPU, loop_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);
	typeof(d_error) error;
	cudaMemcpyFromSymbol(&error, "d_error", sizeof(error), 0, cudaMemcpyDeviceToHost);
	clock_t end = clock();
	double elapsed_time = double(end - begin) / CLOCKS_PER_SEC;
	double total_error = error[blocks*threads_per_block-1];
	printf("Execution Time of the Training is:		%0.10f seconds\n",elapsed_time);
	printf("Error of the Neural Network is:		%0.10f\n",total_error);