I tried to parallelize my serial neural network code using CUDA. The program should output the error of the neural network and the runtime. My approach was to pass each array as a pointer from main and to map each thread onto its own block of data inside those arrays. But the error for almost every thread never changes from 0 to anything else, and the training loop, which should execute e.g. 1000 times, only runs a few times. Also, the few nonzero errors are numbers dozens of digits long, not small values like 0.something. All of this results in the program incorrectly printing a constant output error of zero.
I suspect that the thread-to-array correspondence is not right, or that some thread writes to the wrong variable. I know this is a lot to ask, but if someone has even the slightest suggestion, idea, or solution that would help me run the code properly and get the right output, I would be very grateful. I have been trying for days and I am lost in the debugging, so I am sincerely desperate. Thank you very much in advance!
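To show what I mean by "correspond each thread with a data block", here is a minimal sketch of the indexing scheme I had in mind (a hypothetical kernel with made-up names, not my actual code): every thread computes a global index and only touches its own row of a flattened [num_threads x N] array.

    // minimal sketch (hypothetical names): thread `index` owns row `index`
    // of a flattened [num_threads x N] array, so writes should never overlap
    __global__ void per_thread_rows(double *arr, int N)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        for (int j = 0; j < N; j++)
            arr[index * N + j] = 0.0;  // this thread writes only row `index`
    }

    // launched e.g. as: per_thread_rows<<<blocks, threads_per_block>>>(dev_arr, N);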
The code is (final.cu):
#define w(i,j) w[(i)*(InputN*hn) + (j)]
#define v(i,j) v[(i)*(hn*OutN) + (j)]
#define x_out(i,j) x_out[(i)*(InputN) + (j)]
#define y(i,j) y[(i)*(OutN) + (j)]
#define hn_out(i,j) hn_out[(i)*(hn) + (j)]
#define y_out(i,j) y_out[(i)*(OutN) + (j)]
#define y_delta(i,j) y_delta[(i)*(OutN) + (j)]
#define hn_delta(i,j) hn_delta[(i)*(hn) + (j)]
#define deltav(i,j) deltav[(i)*(hn*OutN) + (j)]
#define deltaw(i,j) deltaw[(i)*(InputN*hn) + (j)]
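// (These macros flatten 2D indexing into the 1D arrays allocated in main().
// x_out/y/hn_out/y_out/y_delta/hn_delta take the thread index as the row,
// but w/v/deltaw/deltav take only neuron indices, so as far as I can tell
// all threads address the same small region of w and v — this may be part
// of my correspondence problem.)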
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string>
#include <time.h>
#include <sys/time.h>
#include <ctime>
#include "/home/user/include_files/cuda-8.0/include/cuda.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime.h"
#include "/home/user/include_files/cuda-8.0/include/cuda_runtime_api.h"
#define datanum 4 // number of training samples
#define InputN 2 // number of neurons in the input layer
#define hn 4 // number of neurons in the hidden layer
#define OutN 1 // number of neurons in the output layer
#define threads_per_block 16
#define blocks 4
#define MAX_RAND 255
#define MIN_RAND 10
using namespace std;
__device__ double d_error[blocks*threads_per_block];
// sigmoid serves as the activation function
__device__ double sigmoid(double x){
    return(1.0 / (1.0 + exp(-x)));
}

// scales randData[index] into [MIN_RAND, MAX_RAND]; as far as I can tell,
// this assumes randData holds values in [0, 1)
__device__ int rand_kernel(int index, float *randData){
    float myrandf = randData[index];
    myrandf *= (MAX_RAND - MIN_RAND + 0.999999);
    myrandf += MIN_RAND;
    int myrand = (int)truncf(myrandf);
    return myrand;
}
__global__ void neural_network_kernel (float *randData, int *times, int *loop, double *max, double *min, double *x_out, double *hn_out, double *y_out, double *y, double *w, double *v, double *deltaw, double *deltav, double *hn_delta, double *y_delta, double *alpha, double *beta, double *sumtemp, double *errtemp)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // per-thread local copy of the training set
    struct{
        double input_kernel[InputN];
        double teach_kernel[OutN];
    }data_kernel[blocks*threads_per_block + datanum];

    for(int m=0; m<datanum; m++){
        for(int i=0; i<InputN; i++)
            data_kernel[threads_per_block + m].input_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
        for(int i=0; i<OutN; i++)
            data_kernel[threads_per_block + m].teach_kernel[i] = (double)rand_kernel(index, randData)/32767.0;
    }
    __syncthreads();

    // Initialization
    for(int i=0; i<InputN; i++){
        for(int j=0; j<hn; j++){
            w(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
            deltaw(i,j) = 0;
        }
    }
    for(int i=0; i<hn; i++){
        for(int j=0; j<OutN; j++){
            v(i,j) = ((double)rand_kernel(index, randData)/32767.0)*2-1;
            deltav(i,j) = 0;
        }
    }

    // Training loop
    for(loop[index]=0; loop[index] < *times; loop[index]++){
        d_error[index] = 0.0;
        for(int m=0; m<datanum; m++){
            // Feedforward
            max[index] = 0.0;
            min[index] = 0.0;
            for(int i=0; i<InputN; i++){
                x_out(index,i) = data_kernel[threads_per_block + m].input_kernel[i];
                if(max[index] < x_out(index,i))
                    max[index] = x_out(index,i);
                if(min[index] > x_out(index,i))
                    min[index] = x_out(index,i);
            }
            for(int i=0; i<InputN; i++){
                x_out(index,i) = (x_out(index,i) - min[index]) / (max[index] - min[index]);
            }
            for(int i=0; i<OutN; i++){
                y(index,i) = data_kernel[threads_per_block + m].teach_kernel[i];
            }
            for(int i=0; i<hn; i++){
                sumtemp[index] = 0.0;
                for(int j=0; j<InputN; j++)
                    sumtemp[index] += w(j,i) * x_out(index,j);
                hn_out(index,i) = sigmoid(sumtemp[index]); // sigmoid serves as the activation function
            }
            for(int i=0; i<OutN; i++){
                sumtemp[index] = 0.0;
                for(int j=0; j<hn; j++)
                    sumtemp[index] += v(j,i) * hn_out(index,j);
                y_out(index,i) = sigmoid(sumtemp[index]);
            }

            // Backpropagation
            for(int i=0; i<OutN; i++){
                errtemp[index] = y(index,i) - y_out(index,i);
                y_delta(index,i) = -errtemp[index] * sigmoid(y_out(index,i)) * (1.0 - sigmoid(y_out(index,i)));
                d_error[index] += errtemp[index] * errtemp[index];
            }
            printf("partial : %0.10f\n", d_error[index]);
            for(int i=0; i<hn; i++){
                errtemp[index] = 0.0;
                for(int j=0; j<OutN; j++)
                    errtemp[index] += y_delta(index,j) * v(i,j);
                hn_delta(index,i) = errtemp[index] * (1.0 + hn_out(index,i)) * (1.0 - hn_out(index,i));
            }

            // Stochastic gradient descent
            for(int i=0; i<OutN; i++){
                for(int j=0; j<hn; j++){
                    deltav(j,i) = (*alpha) * deltav(j,i) + (*beta) * y_delta(index,i) * hn_out(index,j);
                    v(j,i) -= deltav(j,i);
                }
            }
            for(int i=0; i<hn; i++){
                for(int j=0; j<InputN; j++){
                    deltaw(j,i) = (*alpha) * deltaw(j,i) + (*beta) * hn_delta(index,i) * x_out(index,j);
                    w(j,i) -= deltaw(j,i);
                }
            }
        }

        // Global error
        d_error[index] = d_error[index] / 2.0;
        printf("\n\n\nThe %d th training, error: %0.10f\n\n\n", loop[index], d_error[index]);
        d_error[threads_per_block*blocks-1] = d_error[index];
        if (index==0)
            printf("Total Error : %0.10f\n", d_error[threads_per_block*blocks-1]);
    }
}
int main(int argc, char *argv[]){
    int times = 1000;
    double alpha = 0.1, beta = 0.1;
    clock_t begin = clock();
    srand (time(NULL));

    float *randData;
    randData = (float *)malloc(blocks*threads_per_block*sizeof(float));
    for (int i=0; i<blocks*threads_per_block; i++)
    {
        randData[i] = rand()%100; // without the %100, rand() returns values in the billions
    }
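    // NOTE: rand_kernel above looks like it expects randData values in [0, 1)
    // (it scales by MAX_RAND - MIN_RAND and then adds MIN_RAND). If that is
    // the intended range, a hypothetical alternative initialization would be
    //     randData[i] = (float)rand() / (float)RAND_MAX;
    // but I am not sure which range is right, so I kept the %100 version.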
    int *loop;
    loop = (int *)malloc(blocks*threads_per_block*sizeof(int));
    double *max, *min;
    max = (double *)malloc(blocks*threads_per_block*sizeof(double));
    min = (double *)malloc(blocks*threads_per_block*sizeof(double));
    /*for (int i=0; i<blocks*threads_per_block; i++)
    {
        loop[i] = 0;
        error[i] = 0.0;
        max[i] = 0.0;
        min[i]= 0.0;
    }*/
    // host-side buffers, one row/slot per thread
    double *x_out;
    x_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN);
    double *hn_out;
    hn_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);
    double *y_out;
    y_out = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
    double *y;
    y = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
    double *hn_delta;
    hn_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn);
    double *y_delta;
    y_delta = (double *)malloc(blocks*threads_per_block*sizeof(double)*OutN);
    printf("new11\n");
    double *sumtemp;
    sumtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));
    double *errtemp;
    errtemp = (double *)malloc(blocks*threads_per_block*sizeof(double));
    double *w;
    w = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn); // heap allocation, because these arrays are too big for the stack
    double *v;
    v = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);
    double *deltaw;
    deltaw = (double *)malloc(blocks*threads_per_block*sizeof(double)*InputN*hn);
    double *deltav;
    deltav = (double *)malloc(blocks*threads_per_block*sizeof(double)*hn*OutN);

    // device-side copies of everything
    double *max_p_GPU, *min_p_GPU;
    float *randData_p_GPU;
    int *times_p_GPU, *loop_p_GPU;
    double *x_out_p_GPU, *hn_out_p_GPU, *y_out_p_GPU, *y_p_GPU, *w_p_GPU, *v_p_GPU;
    double *deltaw_p_GPU, *deltav_p_GPU, *hn_delta_p_GPU;
    double *y_delta_p_GPU, *alpha_p_GPU, *beta_p_GPU, *sumtemp_p_GPU, *errtemp_p_GPU;
    cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float));
    cudaMemcpy(randData_p_GPU, randData, blocks*threads_per_block*sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&times_p_GPU, sizeof(int));
    cudaMemcpy(times_p_GPU, &times, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&loop_p_GPU, blocks*threads_per_block*sizeof(int));
    cudaMemcpy(loop_p_GPU, loop, blocks*threads_per_block*sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&max_p_GPU, blocks*threads_per_block*sizeof(double));
    cudaMemcpy(max_p_GPU, max, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&min_p_GPU, blocks*threads_per_block*sizeof(double));
    cudaMemcpy(min_p_GPU, min, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&x_out_p_GPU, blocks*threads_per_block*sizeof(double)*InputN);
    cudaMemcpy(x_out_p_GPU, x_out, blocks*threads_per_block*sizeof(double)*InputN, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&hn_out_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
    cudaMemcpy(hn_out_p_GPU, hn_out, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&y_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
    cudaMemcpy(y_p_GPU, y, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&y_out_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
    cudaMemcpy(y_out_p_GPU, y_out, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&hn_delta_p_GPU, blocks*threads_per_block*sizeof(double)*hn);
    cudaMemcpy(hn_delta_p_GPU, hn_delta, blocks*threads_per_block*sizeof(double)*hn, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&y_delta_p_GPU, blocks*threads_per_block*sizeof(double)*OutN);
    cudaMemcpy(y_delta_p_GPU, y_delta, blocks*threads_per_block*sizeof(double)*OutN, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&alpha_p_GPU, sizeof(double));
    cudaMemcpy(alpha_p_GPU, &alpha, sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&beta_p_GPU, sizeof(double));
    cudaMemcpy(beta_p_GPU, &beta, sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&sumtemp_p_GPU, blocks*threads_per_block*sizeof(double));
    cudaMemcpy(sumtemp_p_GPU, sumtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&errtemp_p_GPU, blocks*threads_per_block*sizeof(double));
    cudaMemcpy(errtemp_p_GPU, errtemp, blocks*threads_per_block*sizeof(double), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&w_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
    cudaMemcpy(w_p_GPU, w, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&v_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
    cudaMemcpy(v_p_GPU, v, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&deltaw_p_GPU, blocks*threads_per_block*sizeof(double)*InputN*hn);
    cudaMemcpy(deltaw_p_GPU, deltaw, blocks*threads_per_block*sizeof(double)*(InputN*hn), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&deltav_p_GPU, blocks*threads_per_block*sizeof(double)*hn*OutN);
    cudaMemcpy(deltav_p_GPU, deltav, blocks*threads_per_block*sizeof(double)*(hn*OutN), cudaMemcpyHostToDevice);
    // pass the device copy randData_p_GPU to the kernel, not the host pointer randData
    neural_network_kernel<<<blocks, threads_per_block>>>(randData_p_GPU, times_p_GPU, loop_p_GPU, max_p_GPU, min_p_GPU, x_out_p_GPU, hn_out_p_GPU, y_out_p_GPU, y_p_GPU, w_p_GPU, v_p_GPU, deltaw_p_GPU, deltav_p_GPU, hn_delta_p_GPU, y_delta_p_GPU, alpha_p_GPU, beta_p_GPU, sumtemp_p_GPU, errtemp_p_GPU);
    cudaDeviceSynchronize();

    // copy the per-thread errors back from the __device__ symbol;
    // cudaMemcpyFromSymbol takes the symbol itself, not its name as a string
    double error[blocks*threads_per_block];
    cudaMemcpyFromSymbol(error, d_error, sizeof(error), 0, cudaMemcpyDeviceToHost);

    clock_t end = clock();
    double elapsed_time = double(end - begin) / CLOCKS_PER_SEC;
    double total_error = error[blocks*threads_per_block-1];
    printf("Execution Time of the Training is: %0.10f seconds\n", elapsed_time);
    printf("Error of the Neural Network is: %0.10f\n", total_error);
    free(randData);
    free(loop);
    free(max);
    free(min);
    free(x_out);
    free(hn_out);
    free(y_out);
    free(y);
    free(hn_delta);
    free(y_delta);
    free(sumtemp);
    free(errtemp);
    free(w);
    free(v);
    free(deltav);
    free(deltaw);
    return 0;
}
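One more thing: I have not added any error checking around the CUDA calls yet. If that matters, this is the kind of wrapper I would add (a standard pattern using the cudaError_t API, not something already in my code):

    // standard CUDA error-check macro: print the failing call's error and exit
    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t err_ = (call);                                     \
            if (err_ != cudaSuccess) {                                     \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                        cudaGetErrorString(err_), __FILE__, __LINE__);     \
                exit(1);                                                   \
            }                                                              \
        } while (0)

    // usage: wrap every runtime call, and check the kernel launch itself:
    // CUDA_CHECK(cudaMalloc((void **)&randData_p_GPU, blocks*threads_per_block*sizeof(float)));
    // neural_network_kernel<<<blocks, threads_per_block>>>(/* ... */);
    // CUDA_CHECK(cudaGetLastError());
    // CUDA_CHECK(cudaDeviceSynchronize());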