Thanks striker159. I tried multiple ways. Following is the complete code that I am working with now. I tried to pass a two dimensional array to the Kernel, testNeurons(). Nothing really updates after the Kernel is launched.
#include <stdio.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <algorithm>
#include <vector>
#define LEARNING_RATE 0.25
#define NUMB_OF_EPOCHS 1000000
#define TD_X 4 // training data in x- dimension
#define TD_Y 2 // training data in y- dimension
#define TD_Z 2 // training data in z- dimension
double TRAINING_DATA[TD_X][TD_Y][TD_Z] = {{{0,0},{0}},
{{0,1},{1}},
{{1,0},{1}},
{{1,1},{0}}};
double applyActivationFunction(double weightedSum) {
// activation function is a sigmoid function
return (1.0 / (1 + exp(-1.0 * weightedSum)));
}
void _setNeurons_(float *neurons[5]){
srand((long)time(NULL)); /* initialize rand() */
for (int i = 0; i < 2; i ++){
neurons[i][0] = 0.5 - (rand()/(double)RAND_MAX); // threshold
neurons[i][1] = 0.5 - (rand()/(double)RAND_MAX); // weight 1
neurons[i][2] = 0.5 - (rand()/(double)RAND_MAX); // weight 2
neurons[i][3] = 0.0; //output
neurons[i][4] = 0.0; //error
}
for (int i = 2; i < 4; i ++){
neurons[i][0] = 0.5 - (rand()/(double)RAND_MAX); // threshold
neurons[i][1] = 0.5 - (rand()/(double)RAND_MAX); // weight 1
neurons[i][2] = 0.5 - (rand()/(double)RAND_MAX); // weight 2
neurons[i][3] = 0.0; //output
neurons[i][4] = 0.0; //error
}
neurons[4][0] = 0.5 - (rand()/(double)RAND_MAX); // threshold
neurons[4][1] = 0.5 - (rand()/(double)RAND_MAX); // weight 1
neurons[4][2] = 0.5 - (rand()/(double)RAND_MAX); // weight 2
neurons[4][3] = 0.0; //output
neurons[4][4] = 0.0; //error
}
void _printTrainingData_(float *neurons[5]){
printf("[(I: %.2f), (I: %.2f), ", neurons[0][3], neurons[1][3]);
printf("(H: %.2f, %.2f, %.2f, %.5f), ", neurons[2][1], neurons[2][2], neurons[2][0], neurons[2][3]);
printf("(H: %.2f, %.2f, %.2f, %.5f), ", neurons[3][1], neurons[3][2], neurons[3][0], neurons[3][3]);
printf("(O: %.2f, %.2f, %.2f, %.5f)]\n ", neurons[4][1], neurons[4][2], neurons[4][0], neurons[4][3]);
}
void _forwardProp_(double input[], float *neurons[5], const int Nsize) {
double weightedSum = 0;
for( int i = 0; i < Nsize; i++){
switch (i) {
case 0: case 1: // input layer
neurons[i][3] = input[i];
break;
case 2: case 3: // hidden layer
weightedSum = neurons[i][0] +
neurons[i][1] * neurons[0][3] +
neurons[i][2] * neurons[1][3];
neurons[i][3] = applyActivationFunction(weightedSum);
break;
case 4: // output layer
weightedSum = neurons[i][0] +
neurons[i][1] * neurons[2][3] +
neurons[i][2] * neurons[3][3];
neurons[i][3] = applyActivationFunction(weightedSum);
break;
}
}
}
void _printResult_(double result[]) {
printf(" Input 1 | Input 2 | Target Result | Result \n");
printf("-------------------------------------------------------------\n");
for(int i = 0; i < 4; i++ ) {
for(int j = 0; j < 2; j++) {
printf(" %.5f |", TRAINING_DATA[i][0][j]);
}
printf(" %.5f | %.5f \n", TRAINING_DATA[i][1][0], result[i]);
}
}
void _printNetworkInfo_(){
// the number of inputs, hidden layers and output layers are set.
// the number of iterations and learning rate can be vary.
printf("Number of inputs: %d\n", 2);
printf("Number of hidden layers: %d\n", 2);
printf("Number of output: %d\n", 1);
printf("Number of iterations: %d\n", NUMB_OF_EPOCHS);
printf("Learning Rate: %.2f\n", LEARNING_RATE);
}
__global__ void testNeurons(float *Neurons[5]){
printf("start computing in GPU\n");
int i = blockIdx.x * blockDim.x + threadIdx.x;
Neurons[i][0] = 1.35; // update some numbers.
Neurons[i][1] = 3.46; // update some numbers.
Neurons[i][2] = 5.53; // update some numbers.
Neurons[i][3] = 2.34 ; // update some numbers.
printf("Neurons[%d][3] = %.2f\n", i, Neurons[i][3]);
printf("done updating in GPU\n");
}
int main(void){
// set up device
_printNetworkInfo_();
double result[] = {0, 0, 0, 0};
int N = 5; // number of neurons
int V = 5; // number of variables
size_t nBytes = V * sizeof(float);
// declare and initialize neurons as pointers
// malloc device global memory
float *neurons[V];
float *dev_neurons[V];
for(int i = 0; i < N; i++){
neurons[i] = (float *) malloc(nBytes);
cudaMalloc((void**) &dev_neurons[i], nBytes);
}
_setNeurons_(neurons); // initialize neurons
// transfer data from host to device
for(int i = 0; i < TD_X; i++) { // TD_X - Traning Data Dimension X
_forwardProp_(TRAINING_DATA[i][0], neurons, N);
result[i] = neurons[4][3]; // get output
}
_printResult_(result);
cudaMemcpy(dev_neurons, neurons, N * nBytes, cudaMemcpyHostToDevice);
// train network from CPU.
float GPUtime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testNeurons <<< 1, 5>>> (dev_neurons);
cudaDeviceSynchronize();
cudaMemcpy(neurons, dev_neurons, N * nBytes, cudaMemcpyDeviceToHost);
_printResult_(result);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&GPUtime, start, stop);
printf("Compute time on GPU: %3.6f ms \n", GPUtime);
cudaFree(dev_neurons);
return(1);
}