What is wrong with my weight update functions?

I’m writing a neural network with 3 layers (input, hidden and output) and biases in CUDA C, but it doesn’t learn.
The bias is added after all the weighted inputs inside the activation function, e.g.: σ(w[0][0]*input[0] + w[0][1]*input[1] + … + bias[0]).
If I set the weights and biases manually, the network computes XOR with no error, which suggests the forward pass is correct.
I have also tried various activation functions, so I don’t think the problem is there.
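
To make the setup concrete, the forward pass for the hidden layer looks roughly like this (a simplified sketch rather than my exact code; forwardHidden and biasesHidden are just illustrative names):

// standard logistic σ (one of the activations I tried)
__device__ float sigmoid(const float x) {
	return 1.0f / (1.0f + expf(-x));
}

// Simplified sketch of the forward pass for the hidden layer:
// each hidden node sums its weighted inputs, adds its bias, then applies σ.
__global__ void forwardHidden(
	const float* __restrict__ input,
	const float* __restrict__ weightsInput,
	const float* __restrict__ biasesHidden,
	float* __restrict__ activatedHiddenLayer,
	const unsigned short inputNodes,
	const unsigned short hiddenNodes
) {
	const unsigned short row = blockIdx.x * blockDim.x + threadIdx.x;

	if (row < hiddenNodes) {
		float sum = 0.0f;

		for (unsigned short i = 0; i < inputNodes; ++i) {
			sum += weightsInput[row * inputNodes + i] * input[i];
		}

		// the bias is added after all the weights, then the activation is applied
		activatedHiddenLayer[row] = sigmoid(sum + biasesHidden[row]);
	}
}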

Here is the kernel that updates the weights between the input layer and the hidden layer:

__global__ void updateWeightsInput(
	float* __restrict__ weightsInput,
	const float learningRate,
	const float* __restrict__ input,
	const float* __restrict__ derivativeHiddenLayer,
	const float* __restrict__ activatedOutputLayer,
	const float* __restrict__ derivativeOutputLayer,
	const float* __restrict__ targetOutput,
	const unsigned short inputNodes,
	const unsigned short hiddenNodes,
	const unsigned short outputNodes
) {
	const unsigned short row    = blockIdx.x * blockDim.x + threadIdx.x;
	const unsigned short column = blockIdx.y * blockDim.y + threadIdx.y;

	if (row < hiddenNodes && column < inputNodes) {
		float sum1 = 0, sum2 = 0;
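		// sum1 is meant to be the error signal backpropagated from the output layer to this hidden node,
		// sum2 the weighted input sum of this hidden node (so that σ'(sum2) can be taken below)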

		for (unsigned short i = 0; i < outputNodes; ++i) {
			sum1 += derivativeOutputLayer[i] * weightsInput[row * inputNodes + i] * (activatedOutputLayer[i] - targetOutput[i]);
		}

		for (unsigned short i = 0; i < inputNodes; ++i) {
			sum2 += input[i] * weightsInput[i * inputNodes + column];
		}

		weightsInput[row * inputNodes + column] -= learningRate * input[column] * sum1 * (sigmoid(sum2) * (1 - sigmoid(sum2)));
	}
}
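
For reference, the update I am trying to implement for the weight between input node column and hidden node row is the usual backpropagation gradient for a squared error with sigmoid activation (my own derivation, so it may itself be wrong):

weightsInput[row][column] -= learningRate * input[column] * σ'(hiddenSum[row]) * Σᵢ (activatedOutputLayer[i] − targetOutput[i]) * σ'(outputSum[i]) * weightsHidden[i][row]

Here hiddenSum and outputSum stand for the pre-activation sums of the hidden and output layers, weightsHidden[i][row] is the weight between hidden node row and output node i, and σ'(x) = σ(x) * (1 − σ(x)).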

And here is the kernel that updates the weights between the hidden layer and the output layer:

__global__ void updateWeightsHidden(
	float* __restrict__ weightsHidden,
	const float learningRate,
	const float* __restrict__ activatedHiddenLayer,
	const float* __restrict__ derivativeHiddenLayer,
	const float* __restrict__ activatedOutputLayer,
	const float* __restrict__ targetOutput,
	const unsigned short matrixRows,
	const unsigned short matrixColumns
) {
	const unsigned short row    = blockIdx.x * blockDim.x + threadIdx.x;
	const unsigned short column = blockIdx.y * blockDim.y + threadIdx.y;

	if (row < matrixRows && column < matrixColumns) {
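		// update for the weight between hidden node `column` and output node `row`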
		weightsHidden[row * matrixColumns + column] -= learningRate * activatedHiddenLayer[column] * derivativeHiddenLayer[row] * (activatedOutputLayer[row] - targetOutput[row]);
	}
}
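
Both kernels are launched with a 2D configuration along these lines (simplified; the d_* device pointer names and the block size are just illustrative, not my exact host code):

// one thread per weight: x covers the rows, y covers the columns of each weight matrix
dim3 block(8, 8);
dim3 gridInput ((hiddenNodes + block.x - 1) / block.x, (inputNodes  + block.y - 1) / block.y);
dim3 gridHidden((outputNodes + block.x - 1) / block.x, (hiddenNodes + block.y - 1) / block.y);

// d_* are the corresponding device buffers filled by the forward/backward passes
updateWeightsInput<<<gridInput, block>>>(d_weightsInput, learningRate, d_input,
	d_derivativeHidden, d_activatedOutput, d_derivativeOutput, d_target,
	inputNodes, hiddenNodes, outputNodes);

updateWeightsHidden<<<gridHidden, block>>>(d_weightsHidden, learningRate,
	d_activatedHidden, d_derivativeHidden, d_activatedOutput, d_target,
	outputNodes, hiddenNodes);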

If the full code is necessary, here it is.