I’m creating a neural network with 3 layers (input, hidden and output) and biases in CUDA C, but it can’t learn.

The bias is added to the weighted sum inside the activation function, e.g.: σ(w[0][0]*input[0] + w[0][1]*input[1] + … + bias[0]).

If I set the weights and biases manually, it calculates XOR with no error.

I tried various activation functions, so I don’t think the activation function is the problem.

Here is the kernel which updates the weights between the input layer and the hidden layer:

```
/**
 * Gradient-descent update of the input->hidden weight matrix.
 *
 * Launch layout: 2D grid; the x dimension indexes hidden nodes (`row`), the
 * y dimension indexes input nodes (`column`). Each in-range thread updates
 * the single weight weightsInput[row * inputNodes + column]. No shared
 * memory is used.
 *
 * For a squared-error loss the update applied is
 *
 *   w[row][column] -= learningRate * input[column]
 *                     * (sum_i outputDelta_i * wHidden[i][row])
 *                     * derivativeHiddenLayer[row]
 *
 * with outputDelta_i = derivativeOutputLayer[i]
 *                      * (activatedOutputLayer[i] - targetOutput[i]).
 *
 * @param weightsInput          hiddenNodes x inputNodes weights, row-major; updated in place.
 * @param learningRate          step size.
 * @param input                 input activations, inputNodes entries.
 * @param derivativeHiddenLayer activation derivative of each hidden node, hiddenNodes entries.
 *                              Assumed to already hold sigma'(z) for the full pre-activation
 *                              (bias included), mirroring how derivativeOutputLayer is used
 *                              -- TODO confirm against the forward pass.
 * @param activatedOutputLayer  output activations, outputNodes entries.
 * @param derivativeOutputLayer activation derivative of each output node, outputNodes entries.
 * @param targetOutput          expected outputs, outputNodes entries.
 * @param inputNodes            number of input nodes (columns of weightsInput).
 * @param hiddenNodes           number of hidden nodes (rows of weightsInput).
 * @param outputNodes           number of output nodes.
 * @param weightsHidden         optional hidden->output weights, outputNodes x hiddenNodes,
 *                              row-major (the layout updateWeightsHidden writes). Pass it to
 *                              get the correct back-propagated error. It should still hold
 *                              the values used in the forward pass, so launch this kernel
 *                              before the hidden->output weights are updated (or pass a copy).
 *                              Must not alias weightsInput.
 */
__global__ void updateWeightsInput(
    float* __restrict__ weightsInput,
    const float learningRate,
    const float* __restrict__ input,
    const float* __restrict__ derivativeHiddenLayer,
    const float* __restrict__ activatedOutputLayer,
    const float* __restrict__ derivativeOutputLayer,
    const float* __restrict__ targetOutput,
    const unsigned short inputNodes,
    const unsigned short hiddenNodes,
    const unsigned short outputNodes,
    const float* __restrict__ weightsHidden = nullptr
) {
    // Keep the indices at full width: narrowing to unsigned short wraps at
    // 65536, letting surplus threads of an oversized grid pass the guard.
    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int column = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= hiddenNodes || column >= inputNodes) {
        return;
    }

    // Error arriving at hidden node `row` from every output node.
    float backpropagatedError = 0.0f;
    for (unsigned int i = 0; i < outputNodes; ++i) {
        const float outputDelta =
            derivativeOutputLayer[i] * (activatedOutputLayer[i] - targetOutput[i]);
        float connectingWeight;
        if (weightsHidden != nullptr) {
            // Weight linking hidden node `row` to output node `i`.
            connectingWeight = weightsHidden[static_cast<size_t>(i) * hiddenNodes + row];
        } else {
            // NOTE(review): legacy path kept only so existing launches still
            // compile and behave as before. It reads the input->hidden matrix
            // where the hidden->output weight belongs, which is not the
            // back-propagation formula, races with other threads writing
            // weightsInput, and can read past the buffer when
            // outputNodes > inputNodes. Callers should pass weightsHidden.
            connectingWeight = weightsInput[static_cast<size_t>(row) * inputNodes + i];
        }
        backpropagatedError += outputDelta * connectingWeight;
    }

    // Use the precomputed hidden-layer derivative instead of rebuilding the
    // pre-activation here: the previous in-kernel sum omitted the bias and
    // indexed weightsInput with row/column roles swapped.
    const size_t weightIndex = static_cast<size_t>(row) * inputNodes + column;
    weightsInput[weightIndex] -=
        learningRate * input[column] * backpropagatedError * derivativeHiddenLayer[row];
}
```

Here is the kernel which updates the weights between the hidden layer and the output layer:

```
/**
 * Gradient-descent update of the hidden->output weight matrix.
 *
 * Launch layout: 2D grid; the x dimension indexes matrix rows, the y
 * dimension indexes matrix columns. Each in-range thread updates the single
 * weight weightsHidden[row * matrixColumns + column]. No shared memory is
 * used and no thread reads another thread's weight.
 *
 * @param weightsHidden         matrixRows x matrixColumns weights, row-major; updated in place.
 * @param learningRate          step size.
 * @param activatedHiddenLayer  hidden activations, read at [column] (matrixColumns entries).
 * @param derivativeHiddenLayer activation derivative, read at [row] -- the same index used for
 *                              activatedOutputLayer and targetOutput, so it needs matrixRows
 *                              entries. NOTE(review): despite the name, the delta rule for this
 *                              layer needs the derivative of the OUTPUT layer here; confirm the
 *                              caller passes that array.
 * @param activatedOutputLayer  output activations, read at [row].
 * @param targetOutput          expected outputs, read at [row].
 * @param matrixRows            number of rows of weightsHidden.
 * @param matrixColumns         number of columns of weightsHidden.
 */
__global__ void updateWeightsHidden(
    float* __restrict__ weightsHidden,
    const float learningRate,
    const float* __restrict__ activatedHiddenLayer,
    const float* __restrict__ derivativeHiddenLayer,
    const float* __restrict__ activatedOutputLayer,
    const float* __restrict__ targetOutput,
    const unsigned short matrixRows,
    const unsigned short matrixColumns
) {
    // Keep the indices at full width: narrowing to unsigned short wraps at
    // 65536, so surplus threads of an oversized grid would pass the guard and
    // apply a second, racy update to an already-updated weight.
    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int column = blockIdx.y * blockDim.y + threadIdx.y;
    if (row >= matrixRows || column >= matrixColumns) {
        return;
    }

    // size_t avoids int overflow of row * matrixColumns for large matrices.
    const size_t weightIndex = static_cast<size_t>(row) * matrixColumns + column;
    weightsHidden[weightIndex] -= learningRate * activatedHiddenLayer[column]
        * derivativeHiddenLayer[row] * (activatedOutputLayer[row] - targetOutput[row]);
}
```

If the full code is necessary, here it is.