The full code for this project is available at https://github.com/echoline/fann. I am using two OpenGL compute shaders to run neural networks:
static const char* runShader = "#version 310 es\n"
"precision highp float;\n"
"layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n"
"layout(std430) buffer;\n"
"layout(binding = 0) buffer Network\n"
"{\n"
" float e[];\n"
"} network;\n"
"layout(binding = 1) buffer Weights\n"
"{\n"
" float e[];\n"
"} weights;\n"
"layout(binding = 2) buffer Values\n"
"{\n"
" float e[];\n"
"} values;\n"
"layout(binding = 3) buffer Errors\n"
"{\n"
" float e[];\n"
"} errors;\n"
"layout(binding = 4) buffer Input\n"
"{\n"
" float e[];\n"
"} input_data;\n"
"layout(binding = 5) buffer Output\n"
"{\n"
" float e[];\n"
"} output_data;\n"
"void main()\n"
"{\n"
" int idx = int(gl_LocalInvocationID.x);\n"
" int threads = %d;\n"
" int layers;\n"
" int i, o, n, inputs, outputs, l, total_neurons, total_weights;\n"
" layers = int(network.e[0]) - 1;\n"
" inputs = int(network.e[1]);\n"
" for (i = idx; i < inputs; i += threads)\n"
" values.e[i] = input_data.e[i];\n"
" barrier();\n"
" total_neurons = 0;\n"
" total_weights = 0;\n"
" for (l = 1; l < layers; l++) {\n"
" inputs = int(network.e[l]);\n"
" outputs = int(network.e[l+1]);\n"
" if (idx == 0)\n"
" values.e[total_neurons + inputs] = 1.0;\n"
" barrier();\n"
" for (o = idx; o < outputs; o += threads) {\n"
" errors.e[o] = 0.0;\n"
" n = o * inputs + o;\n"
" for (i = 0; i <= inputs; i++)\n"
" errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n"
" }\n"
" total_neurons += inputs + 1;\n"
" for (o = idx; o < outputs; o += threads) {\n"
" errors.e[o] *= 0.5;\n"
" if (errors.e[o] > 300.0)\n"
" errors.e[o] = 300.0;\n"
" else if (errors.e[o] < -300.0)\n"
" errors.e[o] = -300.0;\n"
" if (errors.e[o] < 0.0)\n"
" errors.e[o] *= 0.01;\n"
" values.e[total_neurons + o] = errors.e[o];\n"
" }\n"
" barrier();\n"
" total_weights += inputs * outputs + outputs;\n"
" }\n"
" inputs = int(network.e[layers]);\n"
" outputs = int(network.e[layers+1]);\n"
" if (idx == 0)\n"
" values.e[total_neurons + inputs] = 1.0;\n"
" barrier();\n"
" for (o = idx; o < outputs; o += threads) {\n"
" errors.e[o] = 0.0;\n"
" n = o * inputs + o;\n"
" for (i = 0; i <= inputs; i++)\n"
" errors.e[o] += values.e[total_neurons + i] * weights.e[total_weights + n + i];\n"
" if (errors.e[o] > 600.0)\n"
" errors.e[o] = 600.0;\n"
" else if (errors.e[o] < -600.0)\n"
" errors.e[o] = -600.0;\n"
" values.e[total_neurons + inputs + 1 + o] = (1.0/(1.0 + exp(-errors.e[o])));\n"
" output_data.e[o] = values.e[total_neurons + inputs + 1 + o];\n"
" }\n"
" barrier();\n"
"}\n";
and train neural networks:
static const char* trainShader = "#version 310 es\n"
"precision highp float;\n"
"layout(local_size_x = %d, local_size_y = 1, local_size_z = 1) in;\n"
"layout(std430) buffer;\n"
"layout(binding = 0) buffer Network\n"
"{\n"
" float e[];\n"
"} network;\n"
"layout(binding = 1) buffer Weights\n"
"{\n"
" float e[];\n"
"} weights;\n"
"layout(binding = 2) buffer Values\n"
"{\n"
" float e[];\n"
"} values;\n"
"layout(binding = 3) buffer Errors\n"
"{\n"
" float e[];\n"
"} errors;\n"
"layout(binding = 4) buffer Input\n"
"{\n"
" float e[];\n"
"} input_data;\n"
"layout(binding = 5) buffer Output\n"
"{\n"
" float e[];\n"
"} output_data;\n"
"void main()\n"
"{\n"
" int idx = int(gl_LocalInvocationID.x);\n"
" int threads = %d;\n"
" int layers;\n"
" int i, o, l, total_neurons, total_weights, outputs, inputs, neuron_prev;\n"
" float neuron_diff, tmp_error;\n"
" layers = int(network.e[0]);\n"
" inputs = int(network.e[1]);\n"
" for (i = idx; i < inputs; i += threads)\n"
" values.e[i] = input_data.e[i];\n"
" barrier();\n"
" total_neurons = 0;\n"
" total_weights = 0;\n"
" for (l = 1; l < layers; l++) {\n"
" total_neurons += int(network.e[l]) + 1;\n"
" total_weights += (int(network.e[l]) + 1) * int(network.e[l+1]);\n"
" }\n"
" total_weights -= (int(network.e[layers-1]) + 1) * int(network.e[layers]);\n"
" outputs = int(network.e[layers]);\n"
" for (o = idx; o < outputs; o += threads) {\n"
" neuron_diff = output_data.e[o] - values.e[total_neurons + o];\n"
" if(neuron_diff < -.9999999)\n"
" neuron_diff = -17.0;\n"
" else if(neuron_diff > .9999999)\n"
" neuron_diff = 17.0;\n"
" else\n"
" neuron_diff = log((1.0 + neuron_diff) / (1.0 - neuron_diff));\n"
" errors.e[total_neurons + o] = neuron_diff * values.e[total_neurons + o] * (1.0 - values.e[total_neurons + o]);\n"
" }\n"
" barrier();\n"
" for (l = layers; l > 2; l--) {\n"
" outputs = int(network.e[l]);\n"
" inputs = int(network.e[l-1]);\n"
" neuron_prev = total_neurons - inputs - 1;\n"
" for (i = idx; i <= inputs; i += threads) {\n"
" errors.e[neuron_prev + i] = 0.0;\n"
" for (o = 0; o < outputs; o++)\n"
" errors.e[neuron_prev + i] += errors.e[total_neurons + o] * weights.e[total_weights + o * inputs + o + i];\n"
" errors.e[neuron_prev + i] *= 0.5;\n"
" if (values.e[neuron_prev + i] < 0.0)\n"
" errors.e[neuron_prev + i] *= 0.01;\n"
" }\n"
" barrier();\n"
" total_neurons = neuron_prev;\n"
" total_weights -= (int(network.e[l-2]) + 1) * inputs;\n"
" }\n"
" total_neurons = int(network.e[1]) + 1;\n"
" neuron_prev = 0;\n"
" total_weights = 0;\n"
" for (l = 2; l <= layers; l++) {\n"
" outputs = int(network.e[l]);\n"
" inputs = int(network.e[l-1]);\n"
" for (o = idx; o < outputs; o += threads) {\n"
" tmp_error = errors.e[total_neurons + o] * 0.7;\n"
" for (i = 0; i <= inputs; i++)\n"
" weights.e[total_weights + o * inputs + o + i] += tmp_error * values.e[neuron_prev + i];\n"
" }\n"
" barrier();\n"
" neuron_prev = total_neurons;\n"
" total_neurons += outputs + 1;\n"
" total_weights += outputs * inputs + outputs;\n"
" }\n"
"}\n";
local_size_x and the threads variable are set to the x value of GL_MAX_COMPUTE_WORK_GROUP_SIZE
SSBO Shader Storage Buffer Objects are allocated only once, then used over and over by both compute shaders:
GLfloat *data;
GLfloat *glvalues;
GLfloat *glweights;
int nparameters;
GLfloat *parameters;
int i;
struct fann_layer *layer_it;
glGenBuffers(1, &ann->glnetwork);
nparameters = 1;
nparameters += (int)(ann->last_layer - ann->first_layer);
parameters = calloc(sizeof(GLfloat), nparameters);
parameters[0] = nparameters - 1;
for(i = 1, layer_it = ann->first_layer; layer_it != ann->last_layer; layer_it++, i++)
parameters[i] = (int)(layer_it->last_neuron - layer_it->first_neuron) - 1;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glnetwork);
glBufferData(GL_SHADER_STORAGE_BUFFER, nparameters * sizeof(GLfloat), parameters, GL_DYNAMIC_COPY);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ann->glnetwork);
glGenBuffers(1, &ann->glweights);
glweights = calloc(sizeof(GLfloat), ann->total_connections);
for (i = 0; i != ann->total_connections; i++)
glweights[i] = ann->weights[i];
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glweights);
glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_connections * sizeof(GLfloat), glweights, GL_DYNAMIC_COPY);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, ann->glweights);
free(glweights);
glGenBuffers(1, &ann->glvalues);
glvalues = calloc(sizeof(GLfloat), ann->total_neurons);
for (i = 0; i != ann->total_neurons; i++)
glvalues[i] = ann->values[i];
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glvalues);
glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), glvalues, GL_DYNAMIC_COPY);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ann->glvalues);
free(glvalues);
glGenBuffers(1, &ann->glerrors);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glerrors);
glBufferData(GL_SHADER_STORAGE_BUFFER, ann->total_neurons * sizeof(GLfloat), NULL, GL_DYNAMIC_COPY);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, ann->glerrors);
glGenBuffers(1, &ann->glinput);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput);
glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_input * sizeof(GLfloat), NULL, GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, ann->glinput);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->glinput);
ann->glinputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_input * sizeof(GLfloat), GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT);
glGenBuffers(1, &ann->gloutput);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput);
glBufferStorage(GL_SHADER_STORAGE_BUFFER, ann->num_output * sizeof(GLfloat), NULL, GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_COHERENT_BIT);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, ann->gloutput);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ann->gloutput);
ann->gloutputdata = (GLfloat*)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, ann->num_output * sizeof(GLfloat), GL_MAP_READ_BIT|GL_MAP_WRITE_BIT|GL_MAP_COHERENT_BIT|GL_MAP_PERSISTENT_BIT);
the glDispatchCompute calls use only one workgroup:
for (i = 0; i < ann->num_input; i++)
ann->glinputdata[i] = input[i];
glFinish();
glUseProgram(ann->runShaderProgram);
glDispatchCompute(1, 1, 1);
glMemoryBarrier(GL_ALL_BARRIER_BITS);
glFinish();
for(i = 0; i != ann->num_output; i++)
ann->output[i] = ann->gloutputdata[i];
I have written a simple test program:
#include <floatfann.h>
void
fanntest(struct fann *ann, fann_type *input, fann_type *output, fann_type *desired_output, int gl)
{
double a, b;
struct timeval now;
int o;
ann->gl = gl;
gettimeofday(&now, NULL);
b = now.tv_sec * 1000000;
b += now.tv_usec;
fann_reset_MSE(ann);
fann_train(ann, input, desired_output);
gettimeofday(&now, NULL);
a = now.tv_sec * 1000000;
a += now.tv_usec;
fprintf(stderr, "%cPU: %f microseconds MSE: %0.10lf\n", gl? 'G': 'C', a - b, ann->MSE_value);
}
int
main(int argc, char **argv)
{
fann_type *input;
fann_type *output;
fann_type *desired_output;
struct fann *ann;
int i;
if (argc < 2)
return -1;
i = atoi(argv[1]);
ann = fann_create_standard(5, i, i, i, i, i);
fann_set_activation_function_hidden(ann, FANN_LINEAR_PIECE_LEAKY);
fann_set_activation_function_output(ann, FANN_SIGMOID);
input = calloc(sizeof(fann_type), ann->num_input);
desired_output = calloc(sizeof(fann_type), ann->num_output);
for (i = 0; i < ann->num_output; i++)
desired_output[i] = 0.73;
fann_print_parameters(ann);
for (i = 0; i < 10; i++)
fanntest(ann, input, output, desired_output, 1);
for (i = 0; i < 10; i++)
fanntest(ann, input, output, desired_output, 0);
return 0;
}
When training the networks, it works the same as the CPU implementation until the network size gets larger and larger. when it is larger than about 5 layers with 1024 nodes per layer, the results start to be different than the CPU after about the third training iteration.
it works fine with networks of 5 layers with 1000 neurons apiece:
GPU: 99080.000000 microseconds MSE: 53.1753273010
GPU: 54394.000000 microseconds MSE: 34.1738243103
GPU: 54567.000000 microseconds MSE: 7.3912019730
GPU: 54211.000000 microseconds MSE: 32.1625099182
GPU: 55041.000000 microseconds MSE: 9.7087984085
GPU: 54136.000000 microseconds MSE: 1.2110902071
GPU: 55459.000000 microseconds MSE: 1.5399940014
GPU: 52942.000000 microseconds MSE: 0.6823561192
GPU: 54997.000000 microseconds MSE: 0.6403807998
GPU: 54674.000000 microseconds MSE: 0.2719307244
CPU: 59415.000000 microseconds MSE: 53.1753273010
CPU: 39278.000000 microseconds MSE: 34.1738166809
CPU: 37671.000000 microseconds MSE: 7.3911962509
CPU: 37373.000000 microseconds MSE: 32.1625022888
CPU: 40650.000000 microseconds MSE: 9.7087860107
CPU: 39656.000000 microseconds MSE: 1.2110930681
CPU: 43019.000000 microseconds MSE: 1.5399991274
CPU: 41082.000000 microseconds MSE: 0.6823565364
CPU: 38982.000000 microseconds MSE: 0.6403820515
CPU: 37495.000000 microseconds MSE: 0.2719300091
but these are the results with 5 layers of 2000 neurons apiece:
GPU: 329097.000000 microseconds MSE: 107.3687744141
GPU: 236530.000000 microseconds MSE: 26.7987861633
GPU: 236830.000000 microseconds MSE: 143.8424530029
GPU: 236839.000000 microseconds MSE: 183.4096984863 <--- results start to be wrong
GPU: 236255.000000 microseconds MSE: 216.0950012207
GPU: 236346.000000 microseconds MSE: 98.7575836182
GPU: 236827.000000 microseconds MSE: 16.6932792664
GPU: 236557.000000 microseconds MSE: 145.7063598633
GPU: 237043.000000 microseconds MSE: 148.0571594238
GPU: 236431.000000 microseconds MSE: 149.8462677002
CPU: 177451.000000 microseconds MSE: 107.3687667847
CPU: 143699.000000 microseconds MSE: 26.7987976074
CPU: 142068.000000 microseconds MSE: 143.8424530029
CPU: 141851.000000 microseconds MSE: 104.6149063110
CPU: 141249.000000 microseconds MSE: 22.5739612579
CPU: 144787.000000 microseconds MSE: 77.6188964844
CPU: 141921.000000 microseconds MSE: 53.6443481445
CPU: 142287.000000 microseconds MSE: 4.9292449951
CPU: 145242.000000 microseconds MSE: 2.5167086124
CPU: 140531.000000 microseconds MSE: 2.7396082878
My question is, why does the GPU start giving such different results with the larger neural networks after the third training iteration? It must not be a floating point calculation difference because the results are about the same until the network is large enough. It only starts going wrong after a few iterations.