Unexpected difference between OpenGL compute shader and CPU implementations of a neural network

The full code for this project is available at https://github.com/echoline/fann. I am using two OpenGL compute shaders: one to run (forward-propagate) neural networks:

static const char* runShader = "#version 310 es\n"
	"precision highp float;\n"
	"layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n"
	"layout(std430) buffer;\n"
	"layout(binding = 0) buffer Network\n"
	"{\n"
	"	float e[];\n"
	"} network;\n"
	"layout(binding = 1) buffer Weights\n"
	"{\n"
	"	float e[];\n"
	"} weights;\n"
	"layout(binding = 2) buffer Values\n"
	"{\n"
	"	float e[];\n"
	"} values;\n"
	"layout(binding = 3) buffer Errors\n"
	"{\n"
	"	float e[];\n"
	"} errors;\n"
	"layout(binding = 4) buffer Input\n"
	"{\n"
	"	float e[];\n"
	"} input_data;\n"
	"layout(binding = 5) buffer Output\n"
	"{\n"
	"	float e[];\n"
	"} output_data;\n"
	"void main()\n"
	"{\n"
	"	int idx = int(gl_LocalInvocationID.x);\n"
	"	int threads = %d;\n"
	"	int layers;\n"
	"	int i, o, n, inputs, outputs, l, total_neurons, total_weights;\n"
	"	layers = int(network.e[0]) - 1;\n"
	"	inputs = int(network.e[1]);\n"
	"	for (i = idx; i < inputs; i += threads)\n"
	"		values.e[i] = input_data.e[i];\n"
	"	barrier();\n"
	"	total_neurons = 0;\n"
	"	total_weights = 0;\n"
	"	for (l = 1; l < layers; l++) {\n"
	"		inputs = int(network.e[l]);\n"
	"		outputs = int(network.e[l+1]);\n"
	"		if (idx == 0)\n"
	"			values.e[total_neurons + inputs] = 1.0;\n"
	"		barrier();\n"
	"		for (o = idx; o < outputs; o += threads) {\n"
	"			errors.e[o] = 0.0;\n"
	"			n = o * inputs + o;\n"
	"			for (i = 0; i <= inputs; i++)\n"
	"				errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n"
	"		}\n"
	"		total_neurons += inputs + 1;\n"
	"		for (o = idx; o < outputs; o += threads) {\n"
	"			errors.e[o] *= 0.5;\n"
	"			if (errors.e[o] > 300.0)\n"
	"				errors.e[o] = 300.0;\n"
	"			else if (errors.e[o] < -300.0)\n"
	"				errors.e[o] = -300.0;\n"
	"			if (errors.e[o] < 0.0)\n"
	"				errors.e[o] *= 0.01;\n"
	"			values.e[total_neurons + o] = errors.e[o];\n"
	"		}\n"
	"		barrier();\n"
	"		total_weights += inputs * outputs + outputs;\n"
	"	}\n"
	"	inputs = int(network.e[layers]);\n"
	"	outputs = int(network.e[layers+1]);\n"
	"	if (idx == 0)\n"
	"		values.e[total_neurons + inputs] = 1.0;\n"
	"	barrier();\n"
	"	for (o = idx; o < outputs; o += threads) {\n"
	"		errors.e[o] = 0.0;\n"
	"		n = o * inputs + o;\n"
	"		for (i = 0; i <= inputs; i++)\n"
	"			errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n"
	"		if (errors.e[o] > 600.0)\n"
	"			errors.e[o] = 600.0;\n"
	"		else if (errors.e[o] < -600.0)\n"
	"			errors.e[o] = -600.0;\n"
	"		values.e[total_neurons + inputs + 1 + o] = (1.0/(1.0 + exp(-errors.e[o])));\n"
	"		output_data.e[o] = values.e[total_neurons + inputs + 1 + o];\n"
	"	}\n"
	"	barrier();\n"
	"}\n";

and one to train them:

static const char* trainShader = "#version 310 es\n"
	"precision highp float;\n"
	"layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n"
	"layout(std430) buffer;\n"
	"layout(binding = 0) buffer Network\n"
	"{\n"
	"	float e[];\n"
	"} network;\n"
	"layout(binding = 1) buffer Weights\n"
	"{\n"
	"	float e[];\n"
	"} weights;\n"
	"layout(binding = 2) buffer Values\n"
	"{\n"
	"	float e[];\n"
	"} values;\n"
	"layout(binding = 3) buffer Errors\n"
	"{\n"
	"	float e[];\n"
	"} errors;\n"
	"layout(binding = 4) buffer Input\n"
	"{\n"
	"	float e[];\n"
	"} input_data;\n"
	"layout(binding = 5) buffer Output\n"
	"{\n"
	"	float e[];\n"
	"} output_data;\n"
	"void main()\n"
	"{\n"
	"	int idx = int(gl_LocalInvocationID.x);\n"
	"	int threads = %d;\n"
	"	int layers;\n"
	"	int i, o, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n"
	"	float neuron_diff, tmp_error;\n"
	"	layers = int(network.e[0]);\n"
	"	inputs = int(network.e[1]);\n"
	"	for (i = idx; i < inputs; i += threads)\n"
	"		values.e[i] = input_data.e[i];\n"
	"	barrier();\n"
	"	total_neurons = 0;\n"
	"	total_weights = 0;\n"
	"	for (l = 1; l < layers; l++) {\n"
	"		total_neurons += int(network.e[l]) + 1;\n"
	"		total_weights += (int(network.e[l]) + 1) * int(network.e[l+1]);\n"
	"	}\n"
	"	total_weights -= (int(network.e[layers-1]) + 1) * int(network.e[layers]);\n"
	"	outputs = int(network.e[layers]);\n"
	"	for (o = idx; o < outputs; o += threads) {\n"
	"		neuron_diff = output_data.e[o] - values.e[total_neurons + o];\n"
	"		if(neuron_diff < -.9999999)\n"
	"			neuron_diff = -17.0;\n"
	"		else if(neuron_diff > .9999999)\n"
	"			neuron_diff = 17.0;\n"
	"		else\n"
	"			neuron_diff = log((1.0 + neuron_diff) / (1.0 - neuron_diff));\n"
	"		errors.e[total_neurons + o] = neuron_diff * values.e[total_neurons + o] * (1.0 - values.e[total_neurons + o]);\n"
	"	}\n"
	"	barrier();\n"
	"	for (l = layers; l > 2; l--) {\n"
	"		outputs = int(network.e[l]);\n"
	"		inputs = int(network.e[l-1]);\n"
	"		neuron_prev = total_neurons - inputs - 1;\n"
	"		for (i = idx; i <= inputs; i += threads) {\n"
	"			errors.e[neuron_prev + i] = 0.0;\n"
	"			for (o = 0; o < outputs; o++)\n"
	"				errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + o * inputs + o + i];\n"
	"			errors.e[neuron_prev + i] *= 0.5;\n"
	"			if (values.e[neuron_prev + i] < 0.0)\n"
	"				errors.e[neuron_prev + i] *= 0.01;\n"
	"		}\n"
	"		barrier();\n"
	"		total_neurons = neuron_prev;\n"
	"		total_weights -= (int(network.e[l-2]) + 1) * inputs;\n"
	"	}\n"
	"	total_neurons = int(network.e[1]) + 1;\n"
	"	neuron_prev = 0;\n"
	"	total_weights = 0;\n"
	"	for (l = 2; l <= layers; l++) {\n"
	"		outputs = int(network.e[l]);\n"
	"		inputs = int(network.e[l-1]);\n"
	"		for (o = idx; o < outputs; o += threads) {\n"
	"			tmp_error = errors.e[total_neurons + o] * 0.7;\n"
	"			for (i = 0; i <= inputs; i++)\n"
	"				weights.e[total_weights + o * inputs + o + i] += tmp_error * values.e[neuron_prev + i];\n"
	"		}\n"
	"		barrier();\n"
	"		neuron_prev = total_neurons;\n"
	"		total_neurons += outputs + 1;\n"
	"		total_weights += outputs * inputs + outputs;\n"
	"	}\n"
	"}\n";

Both %d placeholders (local_size_x in the layout declaration and the threads variable) are set to the x value of GL_MAX_COMPUTE_WORK_GROUP_SIZE.
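
For reference, a minimal sketch of how that limit can be queried and substituted into the two %d placeholders (the variable names here are illustrative; the actual setup code is in the repository):

	GLint wgsize;	/* illustrative names, not the repository's */
	char *source;
	size_t len;

	/* x dimension of the maximum work group size (indexed query) */
	glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &wgsize);
	/* note that GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS separately caps
	   the product of all three local_size dimensions */

	/* fill in local_size_x and the threads variable */
	len = strlen(runShader) + 32;
	source = malloc(len);
	snprintf(source, len, runShader, wgsize, wgsize);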

The SSBOs (Shader Storage Buffer Objects) are allocated only once, then reused by both compute shaders:

	GLfloat *data;
	GLfloat *glvalues;
	GLfloat *glweights;
	int nparameters;
	GLfloat *parameters;
	int i;
	struct fann_layer *layer_it;

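	/* binding 0: network layout (element 0 holds the layer count,
	   elements 1..n each layer's neuron count, bias excluded) */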
	glGenBuffers(1, &ann->glnetwork);

	nparameters = 1;
	nparameters += (int)(ann->last_layer - ann->first_layer);
	parameters = calloc(sizeof(GLfloat), nparameters);
	parameters[0] = nparameters - 1;
	for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++)
		parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1;

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork);
	glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork);

	glGenBuffers(1, &ann->glweights);

	glweights = calloc(sizeof(GLfloat), ann->total_connections);
	for (i = 0; i != ann->total_connections; i++)
		glweights[i] = ann->weights[i];

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights);
	glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights);

	free(glweights);

	glGenBuffers(1, &ann->glvalues);

	glvalues = calloc(sizeof(GLfloat), ann->total_neurons);
	for (i = 0; i != ann->total_neurons; i++)
		glvalues[i] = ann->values[i];

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues);
	glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues);

	free(glvalues);

	glGenBuffers(1, &ann->glerrors);

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors);
	glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors);

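	/* the input and output buffers are persistently and coherently mapped
	   so the host can write inputs and read outputs without re-mapping */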
	glGenBuffers(1, &ann->glinput);

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput);
	glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), NULL, GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput);
	ann->glinputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_input * sizeof(GLfloat), GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT);

	glGenBuffers(1, &ann->gloutput);

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput);
	glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput);
	ann->gloutputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT);

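When comparing against the CPU implementation, the trained weights can be read back from the SSBO by mapping it for reading. A debugging sketch using the handles allocated above (this is not part of the code shown):

	GLfloat *w;

	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights);
	w = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0,
		ann->total_connections * sizeof(GLfloat), GL_MAP_READ_BIT);
	for (i = 0; i != ann->total_connections; i++)
		ann->weights[i] = w[i];	/* copy back for comparison */
	glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
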
The glDispatchCompute calls use only one workgroup. This is the forward-pass (fann_run) invocation:

	for (i = 0; i < ann->num_input; i++)
		ann->glinputdata[i] = input[i];

	glFinish();
	glUseProgram(ann->runShaderProgram);
	glDispatchCompute(1, 1, 1);
	glMemoryBarrier(GL_ALL_BARRIER_BITS);
	glFinish();

	for(i = 0; i != ann->num_output; i++)
		ann->output[i] = ann->gloutputdata[i];

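The training dispatch follows the same pattern. Since the train shader reads output_data as the desired output and reuses the activations left in the values buffer by the run shader, the target values are written into the mapped output buffer first (a sketch, assuming a trainShaderProgram handle that mirrors runShaderProgram):

	for (i = 0; i < ann->num_input; i++)
		ann->glinputdata[i] = input[i];
	for (i = 0; i < ann->num_output; i++)
		ann->gloutputdata[i] = desired_output[i];

	glFinish();
	glUseProgram(ann->trainShaderProgram);	/* assumed name, mirroring runShaderProgram */
	glDispatchCompute(1, 1, 1);
	glMemoryBarrier(GL_ALL_BARRIER_BITS);
	glFinish();
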
I have written a simple test program:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <floatfann.h>

void
fanntest(struct fann *ann, fann_type *input, fann_type *output, fann_type *desired_output, int gl)
{
	double a, b;
	struct timeval now;
	int o;

	ann->gl = gl;

	gettimeofday(&now, NULL);
	b = now.tv_sec * 1000000;
	b += now.tv_usec;

	fann_reset_MSE(ann);
	fann_train(ann, input, desired_output);

	gettimeofday(&now, NULL);
	a = now.tv_sec * 1000000;
	a += now.tv_usec;

	fprintf(stderr, "%cPU: %f microseconds MSE: %0.10lf\n", gl? 'G': 'C', a - b, ann->MSE_value);
}

int
main(int argc, char **argv)
{
	fann_type *input;
	fann_type *output;
	fann_type *desired_output;
	struct fann *ann;
	int i;

	if (argc < 2)
		return -1;

	i = atoi(argv[1]);

	ann = fann_create_standard(5, i, i, i, i, i);
	fann_set_activation_function_hidden(ann, FANN_LINEAR_PIECE_LEAKY);
	fann_set_activation_function_output(ann, FANN_SIGMOID);
	input = calloc(sizeof(fann_type), ann->num_input);
	desired_output = calloc(sizeof(fann_type), ann->num_output);

	for (i = 0; i < ann->num_output; i++)
		desired_output[i] = 0.73;

	fann_print_parameters(ann);

	for (i = 0; i < 10; i++)
		fanntest(ann, input, output, desired_output, 1);
	for (i = 0; i < 10; i++)
		fanntest(ann, input, output, desired_output, 0);

	return 0;
}

When training, the GPU implementation behaves the same as the CPU implementation until the network grows past a certain size. Once it is larger than about 5 layers with 1024 nodes per layer, the results start to differ from the CPU's after about the third training iteration.

It works fine with networks of 5 layers of 1000 neurons apiece:

GPU: 99080.000000 microseconds MSE: 53.1753273010
GPU: 54394.000000 microseconds MSE: 34.1738243103
GPU: 54567.000000 microseconds MSE: 7.3912019730
GPU: 54211.000000 microseconds MSE: 32.1625099182
GPU: 55041.000000 microseconds MSE: 9.7087984085
GPU: 54136.000000 microseconds MSE: 1.2110902071
GPU: 55459.000000 microseconds MSE: 1.5399940014
GPU: 52942.000000 microseconds MSE: 0.6823561192
GPU: 54997.000000 microseconds MSE: 0.6403807998
GPU: 54674.000000 microseconds MSE: 0.2719307244

CPU: 59415.000000 microseconds MSE: 53.1753273010
CPU: 39278.000000 microseconds MSE: 34.1738166809
CPU: 37671.000000 microseconds MSE: 7.3911962509
CPU: 37373.000000 microseconds MSE: 32.1625022888
CPU: 40650.000000 microseconds MSE: 9.7087860107
CPU: 39656.000000 microseconds MSE: 1.2110930681
CPU: 43019.000000 microseconds MSE: 1.5399991274
CPU: 41082.000000 microseconds MSE: 0.6823565364
CPU: 38982.000000 microseconds MSE: 0.6403820515
CPU: 37495.000000 microseconds MSE: 0.2719300091

But these are the results with 5 layers of 2000 neurons apiece:

GPU: 329097.000000 microseconds MSE: 107.3687744141
GPU: 236530.000000 microseconds MSE: 26.7987861633
GPU: 236830.000000 microseconds MSE: 143.8424530029
GPU: 236839.000000 microseconds MSE: 183.4096984863 <--- results start to be wrong
GPU: 236255.000000 microseconds MSE: 216.0950012207
GPU: 236346.000000 microseconds MSE: 98.7575836182
GPU: 236827.000000 microseconds MSE: 16.6932792664
GPU: 236557.000000 microseconds MSE: 145.7063598633
GPU: 237043.000000 microseconds MSE: 148.0571594238
GPU: 236431.000000 microseconds MSE: 149.8462677002

CPU: 177451.000000 microseconds MSE: 107.3687667847
CPU: 143699.000000 microseconds MSE: 26.7987976074
CPU: 142068.000000 microseconds MSE: 143.8424530029
CPU: 141851.000000 microseconds MSE: 104.6149063110
CPU: 141249.000000 microseconds MSE: 22.5739612579
CPU: 144787.000000 microseconds MSE: 77.6188964844
CPU: 141921.000000 microseconds MSE: 53.6443481445
CPU: 142287.000000 microseconds MSE: 4.9292449951
CPU: 145242.000000 microseconds MSE: 2.5167086124
CPU: 140531.000000 microseconds MSE: 2.7396082878

My question is: why does the GPU start giving such different results on larger networks after the third training iteration? It does not look like a mere floating-point precision difference, because the results track the CPU closely until the network is large enough, and even then they only start to diverge after a few iterations.