In this simple NN program, I found that once I use more than 512 threads on my C1060 the performance dramatically improves. With 512 threads or fewer, however, the GPU program actually runs slower than the CPU version.
The best part: if I change the program to always launch 513 threads but have all the unneeded threads simply do nothing, I also get the dramatically improved performance. (This is shown in the commented-out regions of the attached code.)
Why is this?
Thanks, -CS
DATA:
Nodes CPU GPU
25 0.01 0.1
50 0.02 0.15
75 0.04 0.2
100 0.07 0.24
200 0.33 0.44
300 0.61 0.73
400 1.07 1.12
500 1.67 1.9
511 1.78 2.01
512 1.75 2.79
513 1.78 0.11
514 1.77 0.11
516 1.78 0.11
520 1.85 0.11
525 1.85 0.15
550 2.02 0.11
600 2.43 0.12
700 3.41 0.14
800 4.57 0.14
900 5.79 0.13
1000 7.17 0.13
CODE:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include <cutil_inline.h>
#include <quickSigmoid.c>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
#define NODES 100
#define LEVELS 4
#define NEURALNET_INT_CONVERSION 0xFFFF
// Feed-forward pass of the network: one thread per node of each layer.
// Expects a single-block launch; threads with tn >= NODES idle but still
// participate in the barriers.
//   weights: [LEVELS][NODES][NODES] row-major weight matrices
//   mids:    [LEVELS+1][NODES] activations; row 0 is the input layer
//   sigTab:  device-side sigmoid lookup table (see quickSigmoid.c)
__global__ void runNet(float* weights, float* mids, int* sigTab)
{
    int tn = threadIdx.x;
    for (int i = 0; i < LEVELS; i++)
    {
        if (tn < NODES)   // guard so launches with extra threads stay safe
        {
            // Accumulate the weighted sum of the previous layer's outputs in
            // a register. NOTE(fix): the original used '=' inside the loop,
            // so only the k = NODES-1 term ever survived.
            float sum = 0.0f;
            for (int k = 0; k < NODES; k++)
            {
                sum += mids[i*NODES + k] * weights[i*NODES*NODES + tn*NODES + k];
            }
            // Fixed-point sigmoid lookup, then convert back to float.
            mids[(i+1)*NODES + tn] =
                (float)d_quickSigmoid_Sigmoid(sigTab,
                    (int)(sum * NEURALNET_INT_CONVERSION))
                / (float)NEURALNET_INT_CONVERSION;
        }
        // Barrier kept OUTSIDE the divergent branch: every thread in the
        // block must reach it before the next layer reads the values just
        // written. (Placing it inside the if, as in the commented-out
        // original, would be undefined behavior when tn >= NODES exists.)
        __syncthreads();
    }
}
// Host driver: builds a random NODES-wide, LEVELS-deep net, runs the
// feed-forward pass on the GPU and on the CPU, and prints both timings.
// NOTE: all stripped '*' operators from the forum paste are restored here
// (NODESNODESLEVELS -> NODES*NODES*LEVELS, etc.), and the curly smart
// quotes in the printf literals are replaced with plain '"' so the file
// actually compiles.
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    // Host buffers: weights is [LEVELS][NODES][NODES]; mids holds the input
    // layer plus one activation row per level.
    float weights[NODES*NODES*LEVELS];
    float mids[NODES*(LEVELS+1)];
    float* d_weights;
    float* d_mids;
    unsigned int gT;   // GPU timer handle
    unsigned int cT;   // CPU timer handle
    cutCreateTimer(&cT);
    cutCreateTimer(&gT);
    quickSigmoid_init();
    d_quickSigmoid_init();

    // INIT: inputs in [0,1), activations zeroed, weights in [-1,1).
    for (int i = 0; i < NODES; i++)
    {
        mids[i] = rand() / (float)RAND_MAX;
    }
    // Fixed: original stopped at NODES*LEVELS, leaving the last activation
    // row uninitialized when copied to the device.
    for (int i = NODES; i < NODES*(LEVELS+1); i++)
    {
        mids[i] = 0;
    }
    for (int i = 0; i < NODES*NODES*LEVELS; i++)
    {
        weights[i] = (2*(rand() / (float)RAND_MAX)) - 1;
    }

    cudaMalloc((void**)&d_weights, sizeof(float) * NODES*NODES*LEVELS);
    cudaMalloc((void**)&d_mids,    sizeof(float) * NODES*(LEVELS+1));
    cudaMemcpy(d_weights, weights, sizeof(float) * NODES*NODES*LEVELS,
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_mids, mids, sizeof(float) * NODES*(LEVELS+1),
               cudaMemcpyHostToDevice);

    // RUN GPU
    printf("Running GPU\n");
    cutStartTimer(gT);
    cudaThreadSynchronize();
    //dim3 threads(550,1);
    dim3 threads(NODES, 1);   // C1060 (SM 1.3) caps blocks at 512 threads:
                              // any larger launch fails without ever running
    dim3 grid(1, 1);
    runNet<<<grid, threads>>>(d_weights, d_mids, d_quickSigmoid_SigTab);
    // A failed launch is silent; this check is why >512-thread runs "sped
    // up" — the kernel never executed, and nothing reported the error.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    cudaThreadSynchronize();
    cutStopTimer(gT);

    // RUN NORMAL (CPU reference)
    printf("Running CPU\n");
    cutStartTimer(cT);
    for (int i = 0; i < LEVELS; i++)
    {
        for (int j = 0; j < NODES; j++)
        {
            // Accumulate the weighted sum; the original's '=' kept only the
            // final k term (same bug as the kernel, fixed for consistency).
            float sum = 0.0f;
            for (int k = 0; k < NODES; k++)
            {
                sum += mids[i*NODES + k] * weights[i*NODES*NODES + j*NODES + k];
            }
            mids[(i+1)*NODES + j] =
                (float)quickSigmoid_Sigmoid((int)(sum * NEURALNET_INT_CONVERSION))
                / (float)NEURALNET_INT_CONVERSION;
        }
    }
    cutStopTimer(cT);

    printf("CPU: %f\n", cutGetTimerValue(cT));
    printf("GPU: %f\n", cutGetTimerValue(gT));

    // Release device buffers before exit.
    cudaFree(d_weights);
    cudaFree(d_mids);

    CUT_EXIT(argc, argv);
}