Parallelize code that uses structs with pointer-to-pointer members using CUDA

If I have code that takes a struct variable as input and manipulates its elements, how can I parallelize it using CUDA?

void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
{
    INT  i, j;
    REAL Out, Err;

    for (i = 1; i <= Lower->Units; i++) {
        Out = Lower->Output[i];
        Err = 0;
        for (j = 1; j <= Upper->Units; j++) {
            Err += Upper->Weight[j][i] * Upper->Error[j];
        }
        Lower->Error[i] = Net->Gain * Out * (1 - Out) * Err;
    }
}
Where NET and LAYER are structs defined as:
typedef struct {                 /* A LAYER OF A NET:                    */
    INT    Units;                /* - number of units in this layer      */
    REAL*  Output;               /* - output of ith unit                 */
    REAL*  Error;                /* - error term of ith unit             */
    REAL** Weight;               /* - connection weights to ith unit     */
    REAL** WeightSave;           /* - saved weights for stopped training */
    REAL** dWeight;              /* - last weight deltas for momentum    */
} LAYER;

typedef struct {                 /* A NET:                               */
    LAYER** Layer;               /* - layers of this net                 */
    LAYER*  InputLayer;          /* - input layer                        */
    LAYER*  OutputLayer;         /* - output layer                       */
    REAL    Alpha;               /* - momentum factor                    */
    REAL    Eta;                 /* - learning rate                      */
    REAL    Gain;                /* - gain of sigmoid function           */
    REAL    Error;               /* - total net error                    */
} NET;
What I could think of is to first convert the 2D Weight array into 1D, and then either send it to a kernel to take the products or just use the cuBLAS library. Any suggestions?

I have come up with the following implementation so far.

__global__ void calculatelowerError_dev(REAL* lowerError_dev, REAL* lowerOutput_dev, REAL nGain, REAL* Err_dev, INT lowerUnits)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx < lowerUnits) {
        lowerError_dev[idx] = nGain * lowerOutput_dev[idx] * (1 - lowerOutput_dev[idx]) * Err_dev[idx];
    }
}

void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
{
    INT i, j;
    REAL Out;
    REAL* Err;
    REAL nGain = Net->Gain;
    INT lowerUnits = Lower->Units;
    INT upperUnits = Upper->Units;
    REAL* lowerOutput = Lower->Output;
    REAL* upperError = Upper->Error;
    REAL* lowerError = Lower->Error;
    REAL* linearWeight;

    REAL* linearWeight_dev;
    REAL* lowerOutput_dev;
    REAL* upperError_dev;
    REAL* lowerError_dev;
    REAL* Err_dev;

    linearWeight = (REAL*) calloc(lowerUnits * upperUnits, sizeof(REAL));
    Err = (REAL*) calloc(lowerUnits, sizeof(REAL));

    for (i = 1; i <= lowerUnits; i++) {
        for (j = 1; j <= upperUnits; j++) {
            linearWeight[(i*upperUnits)+j] = Upper->Weight[j][i];
        }
    }

    cudaMalloc((void**) &linearWeight_dev, sizeof(REAL) * lowerUnits * upperUnits);
    cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits);
    cudaMalloc((void**) &Err_dev, sizeof(REAL) * lowerUnits);

    cudaMemcpy(linearWeight_dev, linearWeight, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
    cudaMemcpy(upperError_dev, upperError, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

    for (i = 1; i <= lowerUnits; i++) {
        cublasSdot(handle, upperUnits, linearWeight+i, 1, upperError_dev, 1, Err+i);
    }

    cudaMemcpy(Err_dev, Err, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

    cudaMalloc((void**) &lowerError_dev, sizeof(REAL) * lowerUnits);
    threads_per_block = MAX_THREADS_PER_BLOCK;
    nblocks = (lowerUnits + threads_per_block - 1) / threads_per_block;
    calculatelowerError_dev<<< nblocks, threads_per_block >>>(lowerError_dev, lowerOutput_dev, nGain, Err_dev, lowerUnits);

    cudaMemcpy(lowerError, lowerError_dev, sizeof(REAL) * lowerUnits, cudaMemcpyDeviceToHost);
}
The problem is that it gives me an access violation error at the line that accesses linearWeight+i:

cublasSdot(handle, upperUnits, linearWeight+i, 1, upperError_dev, 1, Err+i);
Any suggestions? Can I really pass the address this way?

Are you sure that the error is at that line? You should first check the error code returned by every CUDA function.
At first sight, you allocate sizeof(REAL) * upperUnits bytes for upperError_dev:

cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits);

but later you copy sizeof(REAL) * lowerUnits * upperUnits bytes into it host-to-device:

cudaMemcpy(upperError_dev, upperError, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

That is, at least, one error.
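
As for checking errors: wrapping every runtime call in a small helper makes these problems show up right where they happen. A minimal sketch (the macro name is just an example):

#include <stdio.h>
#include <cuda_runtime.h>

/* Minimal error-checking sketch -- the macro name is only illustrative. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t e_ = (call);                                          \
        if (e_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                 \
                    cudaGetErrorString(e_), __FILE__, __LINE__);          \
        }                                                                 \
    } while (0)

/* Usage with your allocations and copies:
   CUDA_CHECK(cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits));
   CUDA_CHECK(cudaMemcpy(upperError_dev, upperError,
                         sizeof(REAL) * upperUnits, cudaMemcpyHostToDevice));
   The cuBLAS calls return a cublasStatus_t that can be checked the same way. */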

Thanks, yes, I saw that mistake. Can you please check whether I am doing the linearization correctly? I ask because most of the loops in the serial code I started from run from 1.

The loops have to start from 0. Otherwise, if you allocate N elements and index positions 1 through N, you run past the end: in C/C++ the last valid position of an array is N-1, because indexing starts at 0.
So,

for (i = 0; i < N; ++i)
    for (j = 0; j < M; ++j)
        A[i*M + j] = B[i][j]; // or B[j][i], depending on how the 2D matrix is laid out
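
Applied to your Weight matrix, a sketch of the 0-based flattening plus the copy to the device (assuming every array really holds indices 0..Units-1) would be:

/* Sketch only: row i of linearWeight holds the weights feeding lower unit i. */
for (i = 0; i < lowerUnits; ++i)
    for (j = 0; j < upperUnits; ++j)
        linearWeight[i * upperUnits + j] = Upper->Weight[j][i];

cudaMemcpy(linearWeight_dev, linearWeight,
           sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

With this layout the weights for lower unit i are contiguous, so a per-row dot product would read from linearWeight_dev + i * upperUnits; note that cuBLAS expects its vector arguments to be device pointers, not the host copy. And if REAL is float, as your cublasSdot call implies, the whole loop of dot products could in principle be collapsed into a single matrix-vector call, roughly:

const float one = 1.0f, zero = 0.0f;
cublasSgemv(handle, CUBLAS_OP_T,
            upperUnits, lowerUnits,       /* flattened matrix viewed column-major      */
            &one,
            linearWeight_dev, upperUnits, /* leading dimension = upperUnits            */
            upperError_dev, 1,
            &zero,
            Err_dev, 1);                  /* Err_dev[i] = sum_j Weight[j][i] * Error[j] */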