GPU performance tester: weird GPU results

I wrote this simple code today to do some basic benchmarking. All it does is calculate pi in three different ways for each element of a stream (x, y, z components) on the GPU and on the CPU. The bigger the stream, the longer it takes on the CPU, but for some reason it looks like it takes the same amount of time on the GPU regardless of the stream size… What am I doing wrong?

This is the output

size = 256,     t_g = 0.809449 s,       t_c = 17.136713 s,      speedup 21.170845 x

size = 512,     t_g = 0.808610 s,       t_c = 34.299225 s,      speedup 42.417538 x

size = 768,     t_g = 0.809715 s,       t_c = 51.471620 s,      speedup 63.567544 x

size = 1024,    t_g = 0.808499 s,       t_c = 68.520511 s,      speedup 84.750234 x

size = 1280,    t_g = 0.809134 s,       t_c = 85.705849 s,      speedup 105.922963 x

size = 1536,    t_g = 0.808985 s,       t_c = 102.908285 s,     speedup 127.206653 x

size = 1792,    t_g = 0.808505 s,       t_c = 119.568844 s,     speedup 147.888785 x

size = 2048,    t_g = 0.807108 s,       t_c = 135.830341 s,     speedup 168.292548 x

size = 2304,    t_g = 0.808790 s,       t_c = 153.502859 s,     speedup 189.793114 x

size = 2560,    t_g = 0.807486 s,       t_c = 169.191967 s,     speedup 209.529397 x

fdtd complete: total runtime = 946.580994s, gpu = 8.086282s, cpu = 938.136230s, speedup = 116.015770 x

Here is the code

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime_api.h>

#define PI_50 3.14159265358979323846264338327950288419716939937510

/* Device buffer. A plain host-side pointer is what cudaMalloc() expects;
   a __device__ qualifier here would make the symbol unusable from host code. */
float4 *devMem = NULL;

int dim_x = 256;

__global__ void init(float4 *init_me){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    init_me[idx].x = 0.0f;
    init_me[idx].y = 0.0f;
    init_me[idx].z = 0.0f;
    init_me[idx].w = 0.0f;
}

__global__ void plusOne(float4 *add_to_me){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    add_to_me[idx].x += 1;
}

/* Bellard's BBP-type formula for pi. */
__host__ __device__ float bellardBBPTypeFormula(){
    int i;
    float pi = 0.0f;
    for(i = 0; i < 10000; i++){
        pi += (powf(-1.0f, (float)i) / powf(2.0f, 10.0f * (float)i)) *
              ( -(powf(2.0f, 5.0f) / (4.0f * (float)i + 1.0f))
                -(1.0f / (4.0f * (float)i + 3.0f))
                +(powf(2.0f, 8.0f) / (10.0f * (float)i + 1.0f))
                -(powf(2.0f, 6.0f) / (10.0f * (float)i + 3.0f))
                -(powf(2.0f, 2.0f) / (10.0f * (float)i + 5.0f))
                -(powf(2.0f, 2.0f) / (10.0f * (float)i + 7.0f))
                +(1.0f / (10.0f * (float)i + 9.0f)) );
    }
    pi *= (1.0f / powf(2.0f, 6.0f));
    return pi;
}

/* Gregory-Leibniz series for pi. */
__host__ __device__ float gregoryLeibnizSeries(){
    int i;
    float pi = 0.0f;
    for(i = 0; i < 1000000; i++){
        pi += powf(-1.0f, (float)i) * (4.0f / (1.0f + 2.0f * (float)i));
    }
    return pi;
}

/* John Machin's arctangent formula for pi. */
__host__ __device__ float johnMachinMethod(){
    return 4.0f * (4.0f * atanf(1.0f / 5.0f) - atanf(1.0f / 239.0f));
}

__global__ void calcPi(float4 *pi){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    /* idx + pi from each formula; this must match the CPU reference loop in
       main() so the two sets of results can be compared directly. */
    pi[idx].x = (float)idx + johnMachinMethod();
    pi[idx].y = (float)idx + gregoryLeibnizSeries();
    pi[idx].z = (float)idx + bellardBBPTypeFormula();
    pi[idx].w = (float)idx + bellardBBPTypeFormula();
}

int main(int argc, char **argv){
    struct timespec start, stop;
    struct timespec gpu_start, gpu_stop;
    struct timespec cpu_start, cpu_stop;
    int i;
    int j;
    float runtime = 0.0, gpu_runtime = 0.0, cpu_runtime = 0.0;
    float4 *result = NULL;
    float4 *hostMem = NULL;
    double t_c, t_g;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for(j = 1; j <= 10; j++){
        /* Release the previous iteration's buffers (freeing NULL on the first pass is a no-op). */
        cudaFree(devMem);
        free(result);
        free(hostMem);

        /* GPU pass: allocate, launch j blocks of dim_x threads each, copy back.
           cudaMemcpy blocks until the kernels have finished, so the timing includes them. */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &gpu_start);
        result = (float4*)calloc(dim_x*j, sizeof(float4));
        cudaMalloc((void**)&devMem, j*dim_x*sizeof(float4));
        init<<<j,dim_x>>>(devMem);
        calcPi<<<j,dim_x>>>(devMem);
        cudaMemcpy(result, devMem, j*dim_x*sizeof(float4), cudaMemcpyDeviceToHost);
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &gpu_stop);

        /* CPU pass: the same computation as the calcPi kernel, for comparison. */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_start);
        hostMem = (float4*)calloc(dim_x*j, sizeof(float4));
        for(i = 0; i < dim_x*j; i++){
            hostMem[i].x = (float)i + johnMachinMethod();
            hostMem[i].y = (float)i + gregoryLeibnizSeries();
            hostMem[i].z = (float)i + bellardBBPTypeFormula();
            hostMem[i].w = (float)i + bellardBBPTypeFormula();
        }
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_stop);

        /* Validate: fabsf(), not abs(), for float differences. */
        for(i = 0; i < dim_x*j; i++){
            if(fabsf(hostMem[i].x - result[i].x) > 0.01f) fprintf(stderr, "%i)X) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].x - result[i].x), hostMem[i].x, result[i].x);
            if(fabsf(hostMem[i].y - result[i].y) > 0.01f) fprintf(stderr, "%i)Y) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].y - result[i].y), hostMem[i].y, result[i].y);
            if(fabsf(hostMem[i].z - result[i].z) > 0.01f) fprintf(stderr, "%i)Z) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].z - result[i].z), hostMem[i].z, result[i].z);
            if(fabsf(hostMem[i].w - result[i].w) > 0.01f) fprintf(stderr, "%i)W) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].w - result[i].w), hostMem[i].w, result[i].w);
        }

        t_c = (double)(cpu_stop.tv_sec - cpu_start.tv_sec) + (double)(cpu_stop.tv_nsec - cpu_start.tv_nsec)/1000000000.0;
        t_g = (double)(gpu_stop.tv_sec - gpu_start.tv_sec) + (double)(gpu_stop.tv_nsec - gpu_start.tv_nsec)/1000000000.0;
        cpu_runtime += t_c;
        gpu_runtime += t_g;
        printf("size = %i,\tt_g = %f s,\tt_c = %f s,\tspeedup %f x\n", j*dim_x, t_g, t_c, t_c/t_g);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    runtime = (double)(stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1000000000.0;
    printf("fdtd complete: total runtime = %fs, gpu = %fs, cpu = %fs, speedup = %f x\n", runtime, gpu_runtime, cpu_runtime, cpu_runtime/gpu_runtime);

    /* Free the last iteration's buffers. */
    cudaFree(devMem);
    free(result);
    free(hostMem);
    return 0;
}

It’s not a bug, it’s a feature. No really!

When you increase the size of the stream, the work gets distributed across all of the GPU's multiprocessors, so the more elements you add, the more the GPU calculates in parallel and the more the speedup increases.

For every iteration of for(j = 1; j <= 10; j++) you launch the kernel with j blocks (calcPi<<<j,dim_x>>>(devMem);). So you increase the number of blocks as you increase the stream size, and blocks are the units the GPU can execute in parallel.

It can also execute threads in parallel, but that is one level below blocks and not the significant factor in this case.

For j=1 you start 1 block. This block is assigned to one multiprocessor, and that multiprocessor calculates it. For j=2 you start 2 blocks, and those are assigned to multiprocessors so that they can run in parallel. You double the work and the time stays the same. You can keep doing this as long as the GPU has capacity left to parallelize. At some point the GPU has to serialize some blocks, and then your execution time should increase.

Another way to see this is that when you run the kernel with j=1 the GPU is not working at full capacity, not even at half of its capacity. This has to do with occupancy; have a look at the occupancy calculator to find out more about how the GPU works.
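For example, you can ask the runtime how many multiprocessors your card has; a minimal sketch, assuming device 0:

#include <stdio.h>
#include <cuda_runtime_api.h>

int main(void){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);    /* properties of device 0 */
    printf("%s has %d multiprocessors\n", prop.name, prop.multiProcessorCount);

    /* With a launch like calcPi<<<j, dim_x>>>, raising j up to (at least) this
       count adds parallel work rather than run time. */
    return 0;
}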

Thanks, I guess I didn't realize that 1 block goes to 1 multiprocessor, and since I was only running up to 10 blocks, I wasn't saturating my GPU (an 8800 Ultra) yet. I suppose that after 16 blocks I will reach the saturation point and not all of the blocks can be executed in parallel.

Yes and no. One multiprocessor can actually calculate more than 1 block at a time, depending on the number of registers you use per thread and the number of threads you run per block. Here's the occupancy calculator: http://forums.nvidia.com/index.php?showtopic=31279
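Newer CUDA toolkits also expose this calculation as a runtime API call, so you can query it from code instead of the spreadsheet; a rough sketch you could drop into the benchmark source above (reportOccupancy is just an illustrative helper name, and the call needs a reasonably recent toolkit):

__global__ void calcPi(float4 *pi);   /* the kernel defined in the benchmark source */

/* Illustrative helper: how many resident calcPi blocks fit on one multiprocessor,
   given the kernel's register and shared-memory usage. */
void reportOccupancy(int threadsPerBlock){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, calcPi, threadsPerBlock, 0);

    printf("calcPi: %d blocks per multiprocessor, %d blocks to fill the whole GPU\n",
           blocksPerSM, blocksPerSM * prop.multiProcessorCount);
}

You can also see the per-thread register count that feeds into this by compiling with nvcc --ptxas-options=-v.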

As seb mentioned, you definitely want to have more blocks than multiprocessors. This allows the scheduler to hide memory latency by running one block while another block is waiting on global reads. It can make a surprising difference sometimes.

I have been running it for longer, and it looks like it is starting to converge to a peak speedup of about 333x. This is an 8800 Ultra vs. a Core 2 Quad @ 2.4 GHz. Does a 333x speedup seem about right? Only one core is running on the CPU.

nvcc -c howto_streams.cu -I. -I/usr/local/cuda/include

g++ -o stream_tester  howto_streams.o  -lrt -lm -lcudart -I. -I/usr/local/cuda/include -L/usr/local/cuda/lib

LD_LIBRARY_PATH=/usr/local/cuda/lib ./stream_tester

size = 256,     t_g = 0.809483 s,       t_c = 16.934240 s,      speedup 20.919817 x

size = 512,     t_g = 0.808452 s,       t_c = 33.955900 s,      speedup 42.001140 x

size = 768,     t_g = 0.808969 s,       t_c = 50.626780 s,      speedup 62.581872 x

size = 1024,    t_g = 0.808982 s,       t_c = 67.420405 s,      speedup 83.339804 x

size = 1280,    t_g = 0.811770 s,       t_c = 84.882961 s,      speedup 104.565313 x

size = 1536,    t_g = 0.808998 s,       t_c = 101.534036 s,     speedup 125.505889 x

size = 1792,    t_g = 0.809008 s,       t_c = 118.700908 s,     speedup 146.723971 x

size = 2048,    t_g = 0.809000 s,       t_c = 135.407021 s,     speedup 167.375696 x

size = 2304,    t_g = 0.809012 s,       t_c = 152.629377 s,     speedup 188.661365 x

size = 2560,    t_g = 0.809042 s,       t_c = 169.134643 s,     speedup 209.055489 x

size = 2816,    t_g = 0.809014 s,       t_c = 186.648482 s,     speedup 230.711134 x

size = 3072,    t_g = 0.809033 s,       t_c = 203.437381 s,     speedup 251.457596 x

size = 3328,    t_g = 0.808504 s,       t_c = 219.850719 s,     speedup 271.922960 x

size = 3584,    t_g = 0.809394 s,       t_c = 237.406458 s,     speedup 293.313998 x

size = 3840,    t_g = 0.809031 s,       t_c = 253.833718 s,     speedup 313.750235 x

size = 4096,    t_g = 0.808546 s,       t_c = 270.267248 s,     speedup 334.263301 x

size = 4352,    t_g = 1.630859 s,       t_c = 287.655516 s,     speedup 176.382814 x

size = 4608,    t_g = 1.630898 s,       t_c = 304.711656 s,     speedup 186.836690 x

size = 4864,    t_g = 1.630894 s,       t_c = 321.097602 s,     speedup 196.884449 x

size = 5120,    t_g = 1.630973 s,       t_c = 338.058857 s,     speedup 207.274397 x

size = 5376,    t_g = 1.630921 s,       t_c = 355.583852 s,     speedup 218.026396 x

size = 5632,    t_g = 1.630957 s,       t_c = 372.167579 s,     speedup 228.189651 x

size = 5888,    t_g = 1.631412 s,       t_c = 389.079219 s,     speedup 238.492327 x

size = 6144,    t_g = 1.631346 s,       t_c = 405.546981 s,     speedup 248.596607 x

size = 6400,    t_g = 1.630960 s,       t_c = 423.331917 s,     speedup 259.559902 x

size = 6656,    t_g = 1.631343 s,       t_c = 439.793224 s,     speedup 269.589640 x

size = 6912,    t_g = 1.631370 s,       t_c = 456.513014 s,     speedup 279.834193 x

size = 7168,    t_g = 1.631020 s,       t_c = 473.778714 s,     speedup 290.480044 x

size = 7424,    t_g = 1.631062 s,       t_c = 490.256361 s,     speedup 300.574921 x

size = 7680,    t_g = 1.630970 s,       t_c = 507.460173 s,     speedup 311.140061 x

size = 7936,    t_g = 1.630954 s,       t_c = 524.421227 s,     speedup 321.542557 x

size = 8192,    t_g = 1.631378 s,       t_c = 541.475737 s,     speedup 331.913054 x

size = 8448,    t_g = 2.453411 s,       t_c = 558.613681 s,     speedup 227.688544 x

size = 8704,    t_g = 2.453810 s,       t_c = 575.858433 s,     speedup 234.679327 x

size = 8960,    t_g = 2.453524 s,       t_c = 591.847034 s,     speedup 241.223286 x

size = 9216,    t_g = 2.453523 s,       t_c = 609.534515 s,     speedup 248.432369 x

size = 9472,    t_g = 2.453522 s,       t_c = 626.609819 s,     speedup 255.391993 x

size = 9728,    t_g = 2.453511 s,       t_c = 643.998719 s,     speedup 262.480438 x

GPUs are “wicked fast” at running functions that are heavily used within shaders. Your kernel uses powf(), so I'm not surprised that this is a source of huge speedup on the GPU. In some of the CUDA talks I've given, I've pointed out that GPUs are especially fast for things like this that show up in shaders. I've heard of even larger speedups for a few codes that happen to use various other transcendental and related functions.
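One quick way to confirm how much of the time goes into powf() is to compare against a variant of one of the formulas with the constant powers hoisted out of the loop; a rough sketch (bellardBBPTypeFormulaFast is just an illustrative name, not part of the benchmark above):

/* Illustrative variant: the same sum as bellardBBPTypeFormula(), but the constant
   powers of two are folded in and powf(-1, i) / powf(2, 10*i) is carried as a
   running factor, so the loop makes no powf() calls at all. */
__host__ __device__ float bellardBBPTypeFormulaFast(){
    float pi = 0.0f;
    float scale = 1.0f;                      /* (-1)^i / 2^(10*i), updated each iteration */
    for(int i = 0; i < 10000; i++){
        float fi = (float)i;
        pi += scale * ( -(32.0f  / (4.0f  * fi + 1.0f))
                        -(1.0f   / (4.0f  * fi + 3.0f))
                        +(256.0f / (10.0f * fi + 1.0f))
                        -(64.0f  / (10.0f * fi + 3.0f))
                        -(4.0f   / (10.0f * fi + 5.0f))
                        -(4.0f   / (10.0f * fi + 7.0f))
                        +(1.0f   / (10.0f * fi + 9.0f)) );
        scale *= -1.0f / 1024.0f;            /* advance (-1)^i / 2^(10*i) */
    }
    return pi / 64.0f;                       /* the final 1/2^6 factor */
}

If the GPU-vs-CPU gap shrinks with this version, the transcendental functions really were the dominant cost.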

Cheers,

John Stone

Sweet, that means that my finite differencing code will be super fast then. :D

Don't be too excited. It can be very fast for embarrassingly parallel problems, but not that good for general algorithms. For the more complicated functions, which are normally your main bottleneck, reaching five or six times faster can already be considered a success, and debugging is really hard right now. Whenever I get more than a 100x speedup, I check the results first: the code may produce the right result when you only test a few thousand elements but, for some unknown reason, produce complete garbage at larger sizes.
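For example, a check based on relative error holds up better for large values than the fixed 0.01 threshold used in the benchmark above; a quick sketch (nearlyEqual is just an illustrative helper name):

#include <math.h>

/* Illustrative helper: compare a CPU and a GPU value with a relative tolerance,
   falling back to an absolute tolerance for values near zero. */
static int nearlyEqual(float cpu, float gpu, float relTol){
    float diff  = fabsf(cpu - gpu);
    float scale = fmaxf(fabsf(cpu), fabsf(gpu));
    return diff <= relTol * fmaxf(scale, 1.0f);
}

/* Usage, mirroring the validation loop in the benchmark:
   if(!nearlyEqual(hostMem[i].x, result[i].x, 1e-3f))
       fprintf(stderr, "%i)X) cpu %f vs gpu %f\n", i, hostMem[i].x, result[i].x);   */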