// Back-propagation weight update for the fully connected connection between
// layer X and layer X+1: one thread per neuron x of layer X, each thread
// updating all of that neuron's outgoing weights.
//
// Expected launch: 1D grid, 1D blocks; every global thread index < iOffset
// does work, the rest return immediately. No shared memory required.
//
// pNeuronsX      activations of layer X (read-only)
// pWeightsXY     weight matrix; row x (stride iOffset) holds the weights
//                from neuron x of layer X to the neurons of layer X+1.
//                NOTE(review): iOffset doubles as both the thread guard and
//                the row stride, which only works if layers are padded to a
//                common maximum size — confirm against the allocation code.
// pErrorY        back-propagated error terms of layer X+1 (read-only)
// iSizeY         number of (filled) neurons in layer X+1
// iOffset        number of neurons of layer X to process, and the row
//                stride of pWeightsXY
// fLearningRate  gradient step size
__global__
void devAdaptWeights(
	const float* __restrict__ pNeuronsX,   // neurons of layer x
	float* __restrict__ pWeightsXY,        // weights connecting X and X+1
	const float* __restrict__ pErrorY,     // errors of layer x+1
	int iSizeY, int iOffset, float fLearningRate)
{
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	if (x >= iOffset)
		return;

	// Hoist the loop-invariant factor: learning rate times this neuron's
	// activation is the same for every outgoing weight.
	const float fScaledActivation = fLearningRate * pNeuronsX[x];

	// Widen before multiplying so x*iOffset cannot overflow 32-bit int
	// for large, padded layers.
	float* pRow = pWeightsXY + (size_t)x * (size_t)iOffset;

	// NOTE(review): adjacent threads write rows that are iOffset floats
	// apart, so these global stores are uncoalesced. A transposed layout
	// (pWeightsXY[z*iOffset + x]) would let a warp write contiguously.
	for (int z = 0; z < iSizeY; z++) {
		pRow[z] += fScaledActivation * pErrorY[z];
	}
}
On the GPU (GTX 260) the code runs approximately 3-4 times faster compared with my i7 920.
I am not sure whether shared memory makes sense here, but maybe I have to use a different layout for my arrays.