Parallelize code that uses structs with pointer-to-pointer members using CUDA

If I have code that takes a struct variable as input and manipulates its elements, how can I parallelize it using CUDA?

void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
{
    INT  i, j;
    REAL Out, Err;

    for (i = 1; i <= Lower->Units; i++) {
        Out = Lower->Output[i];
        Err = 0;
        for (j = 1; j <= Upper->Units; j++) {
            Err += Upper->Weight[j][i] * Upper->Error[j];
        }
        Lower->Error[i] = Net->Gain * Out * (1 - Out) * Err;
    }
}
Where NET and LAYER are structs defined as:
typedef struct {                 /* A LAYER OF A NET:                    */
    INT    Units;                /* - number of units in this layer      */
    REAL*  Output;               /* - output of ith unit                 */
    REAL*  Error;                /* - error term of ith unit             */
    REAL** Weight;               /* - connection weights to ith unit     */
    REAL** WeightSave;           /* - saved weights for stopped training */
    REAL** dWeight;              /* - last weight deltas for momentum    */
} LAYER;

typedef struct {                 /* A NET:                               */
    LAYER** Layer;               /* - layers of this net                 */
    LAYER*  InputLayer;          /* - input layer                        */
    LAYER*  OutputLayer;         /* - output layer                       */
    REAL    Alpha;               /* - momentum factor                    */
    REAL    Eta;                 /* - learning rate                      */
    REAL    Gain;                /* - gain of sigmoid function           */
    REAL    Error;               /* - total net error                    */
} NET;
What I could think of is to first convert the 2D Weight array into 1D, and then either send it to a kernel to take the products or just use the cuBLAS library. Any suggestions?

I have come up with the following implementation so far.

__global__ void calculatelowerError_dev(REAL* lowerError_dev, REAL* lowerOutput_dev, REAL nGain, REAL* Err_dev, INT lowerUnits)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx < lowerUnits) {
        lowerError_dev[idx] = nGain * lowerOutput_dev[idx] * (1 - lowerOutput_dev[idx]) * Err_dev[idx];
    }
}

void BackpropagateLayer(NET* Net, LAYER* Upper, LAYER* Lower)
{
    INT i, j;
    REAL Out;
    REAL* Err;
    REAL nGain = Net->Gain;
    INT lowerUnits = Lower->Units;
    INT upperUnits = Upper->Units;
    REAL* lowerOutput = Lower->Output;
    REAL* upperError = Upper->Error;
    REAL* lowerError = Lower->Error;
    REAL* linearWeight;

    REAL* linearWeight_dev;
    REAL* lowerOutput_dev;
    REAL* upperError_dev;
    REAL* lowerError_dev;
    REAL* Err_dev;

    linearWeight = (REAL*) calloc(lowerUnits * upperUnits, sizeof(REAL));
    Err = (REAL*) calloc(lowerUnits, sizeof(REAL));

    for (i = 1; i <= lowerUnits; i++) {
        for (j = 1; j <= upperUnits; j++) {
            linearWeight[(i*upperUnits)+j] = Upper->Weight[j][i];
        }
    }

    cudaMalloc((void**) &linearWeight_dev, sizeof(REAL) * lowerUnits * upperUnits);
    cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits);
    cudaMalloc((void**) &Err_dev, sizeof(REAL) * lowerUnits);

    cudaMemcpy(linearWeight_dev, linearWeight, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);
    cudaMemcpy(upperError_dev, upperError, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

    for (i = 1; i <= lowerUnits; i++) {
        cublasSdot(handle, upperUnits, linearWeight+i, 1, upperError_dev, 1, Err+i);
    }

    cudaMemcpy(Err_dev, Err, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

    cudaMalloc((void**) &lowerError_dev, sizeof(REAL) * lowerUnits);
    threads_per_block = MAX_THREADS_PER_BLOCK;
    nblocks = (lowerUnits + threads_per_block - 1) / threads_per_block;
    calculatelowerError_dev<<< nblocks, threads_per_block >>>(lowerError_dev, lowerOutput_dev, nGain, Err_dev, lowerUnits);

    cudaMemcpy(lowerError, lowerError_dev, sizeof(REAL) * lowerUnits, cudaMemcpyDeviceToHost);
}
The problem is that it gives me an access violation error at the line that accesses linearWeight+i:

cublasSdot(handle, upperUnits, linearWeight+i, 1, upperError_dev, 1, Err+i);
Any suggestions? Can I really pass the address this way?

Are you sure that the error is at that line? You should first check the error code returned by every CUDA function.
At first sight, you allocate sizeof(REAL) * upperUnits bytes for upperError_dev:

cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits);

but later you copy sizeof(REAL) * lowerUnits * upperUnits bytes into it host-to-device:

cudaMemcpy(upperError_dev, upperError, sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

That is, at least, one error.
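
As for checking errors: wrapping every runtime call in a small helper makes these problems show up right where they happen. A minimal sketch (the macro name is just an example):

#include <stdio.h>
#include <cuda_runtime.h>

/* Minimal error-checking sketch -- the macro name is only illustrative. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t e_ = (call);                                          \
        if (e_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",                 \
                    cudaGetErrorString(e_), __FILE__, __LINE__);          \
        }                                                                 \
    } while (0)

/* Usage with your allocations and copies:
   CUDA_CHECK(cudaMalloc((void**) &upperError_dev, sizeof(REAL) * upperUnits));
   CUDA_CHECK(cudaMemcpy(upperError_dev, upperError,
                         sizeof(REAL) * upperUnits, cudaMemcpyHostToDevice));
   The cuBLAS calls return a cublasStatus_t that can be checked the same way. */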

Thanks, yes, I saw that mistake. Can you please check whether I am doing the linearization correctly? I ask because most of the loops in the serial code I started from run from 1.

The loops have to start from 0. Otherwise, if you allocate N elements and index positions 1 through N, you run past the end: in C/C++ the last valid position of an array is N-1, because indexing starts at 0.
So,

for (i = 0; i < N; ++i)
    for (j = 0; j < M; ++j)
        A[i*M + j] = B[i][j]; // or B[j][i], depending on how the 2D matrix is laid out
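
Applied to your Weight matrix, a sketch of the 0-based flattening plus the copy to the device (assuming every array really holds indices 0..Units-1) would be:

/* Sketch only: row i of linearWeight holds the weights feeding lower unit i. */
for (i = 0; i < lowerUnits; ++i)
    for (j = 0; j < upperUnits; ++j)
        linearWeight[i * upperUnits + j] = Upper->Weight[j][i];

cudaMemcpy(linearWeight_dev, linearWeight,
           sizeof(REAL) * lowerUnits * upperUnits, cudaMemcpyHostToDevice);

With this layout the weights for lower unit i are contiguous, so a per-row dot product would read from linearWeight_dev + i * upperUnits; note that cuBLAS expects its vector arguments to be device pointers, not the host copy. And if REAL is float, as your cublasSdot call implies, the whole loop of dot products could in principle be collapsed into a single matrix-vector call, roughly:

const float one = 1.0f, zero = 0.0f;
cublasSgemv(handle, CUBLAS_OP_T,
            upperUnits, lowerUnits,       /* flattened matrix viewed column-major      */
            &one,
            linearWeight_dev, upperUnits, /* leading dimension = upperUnits            */
            upperError_dev, 1,
            &zero,
            Err_dev, 1);                  /* Err_dev[i] = sum_j Weight[j][i] * Error[j] */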