GPU performance tester: weird GPU results

I wrote this simple code today to do some basic benchmarking. All it does is calculate pi in three different ways for each element of a stream (x, y, z components) on the GPU and on the CPU. The bigger the stream, the longer it takes on the CPU, but for some reason it looks like it takes the same amount of time on the GPU regardless of the stream size… What am I doing wrong?

This is the output

size = 256,     t_g = 0.809449 s,       t_c = 17.136713 s,      speedup 21.170845 x

size = 512,     t_g = 0.808610 s,       t_c = 34.299225 s,      speedup 42.417538 x

size = 768,     t_g = 0.809715 s,       t_c = 51.471620 s,      speedup 63.567544 x

size = 1024,    t_g = 0.808499 s,       t_c = 68.520511 s,      speedup 84.750234 x

size = 1280,    t_g = 0.809134 s,       t_c = 85.705849 s,      speedup 105.922963 x

size = 1536,    t_g = 0.808985 s,       t_c = 102.908285 s,     speedup 127.206653 x

size = 1792,    t_g = 0.808505 s,       t_c = 119.568844 s,     speedup 147.888785 x

size = 2048,    t_g = 0.807108 s,       t_c = 135.830341 s,     speedup 168.292548 x

size = 2304,    t_g = 0.808790 s,       t_c = 153.502859 s,     speedup 189.793114 x

size = 2560,    t_g = 0.807486 s,       t_c = 169.191967 s,     speedup 209.529397 x

fdtd complete: total runtime = 946.580994s, gpu = 8.086282s, cpu = 938.136230s, speedup = 116.015770 x

Here is the code

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime_api.h>

#define PI_50 3.14159265358979323846264338327950288419716939937510

/* Device buffer. A plain host-side pointer is what cudaMalloc() expects;
   a __device__ qualifier here would make the symbol unusable from host code. */
float4 *devMem = NULL;

int dim_x = 256;

__global__ void init(float4 *init_me){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    init_me[idx].x = 0.0f;
    init_me[idx].y = 0.0f;
    init_me[idx].z = 0.0f;
    init_me[idx].w = 0.0f;
}

__global__ void plusOne(float4 *add_to_me){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    add_to_me[idx].x += 1;
}

/* Bellard's BBP-type formula for pi. */
__host__ __device__ float bellardBBPTypeFormula(){
    int i;
    float pi = 0.0f;
    for(i = 0; i < 10000; i++){
        pi += (powf(-1.0f, (float)i) / powf(2.0f, 10.0f * (float)i)) *
              ( -(powf(2.0f, 5.0f) / (4.0f * (float)i + 1.0f))
                -(1.0f / (4.0f * (float)i + 3.0f))
                +(powf(2.0f, 8.0f) / (10.0f * (float)i + 1.0f))
                -(powf(2.0f, 6.0f) / (10.0f * (float)i + 3.0f))
                -(powf(2.0f, 2.0f) / (10.0f * (float)i + 5.0f))
                -(powf(2.0f, 2.0f) / (10.0f * (float)i + 7.0f))
                +(1.0f / (10.0f * (float)i + 9.0f)) );
    }
    pi *= (1.0f / powf(2.0f, 6.0f));
    return pi;
}

/* Gregory-Leibniz series for pi. */
__host__ __device__ float gregoryLeibnizSeries(){
    int i;
    float pi = 0.0f;
    for(i = 0; i < 1000000; i++){
        pi += powf(-1.0f, (float)i) * (4.0f / (1.0f + 2.0f * (float)i));
    }
    return pi;
}

/* John Machin's arctangent formula for pi. */
__host__ __device__ float johnMachinMethod(){
    return 4.0f * (4.0f * atanf(1.0f / 5.0f) - atanf(1.0f / 239.0f));
}

__global__ void calcPi(float4 *pi){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    /* idx + pi from each formula; this must match the CPU reference loop in
       main() so the two sets of results can be compared directly. */
    pi[idx].x = (float)idx + johnMachinMethod();
    pi[idx].y = (float)idx + gregoryLeibnizSeries();
    pi[idx].z = (float)idx + bellardBBPTypeFormula();
    pi[idx].w = (float)idx + bellardBBPTypeFormula();
}

int main(int argc, char **argv){
    struct timespec start, stop;
    struct timespec gpu_start, gpu_stop;
    struct timespec cpu_start, cpu_stop;
    int i;
    int j;
    float runtime = 0.0, gpu_runtime = 0.0, cpu_runtime = 0.0;
    float4 *result = NULL;
    float4 *hostMem = NULL;
    double t_c, t_g;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    for(j = 1; j <= 10; j++){
        /* Release the previous iteration's buffers (freeing NULL on the first pass is a no-op). */
        cudaFree(devMem);
        free(result);
        free(hostMem);

        /* GPU pass: allocate, launch j blocks of dim_x threads each, copy back.
           cudaMemcpy blocks until the kernels have finished, so the timing includes them. */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &gpu_start);
        result = (float4*)calloc(dim_x*j, sizeof(float4));
        cudaMalloc((void**)&devMem, j*dim_x*sizeof(float4));
        init<<<j,dim_x>>>(devMem);
        calcPi<<<j,dim_x>>>(devMem);
        cudaMemcpy(result, devMem, j*dim_x*sizeof(float4), cudaMemcpyDeviceToHost);
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &gpu_stop);

        /* CPU pass: the same computation as the calcPi kernel, for comparison. */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_start);
        hostMem = (float4*)calloc(dim_x*j, sizeof(float4));
        for(i = 0; i < dim_x*j; i++){
            hostMem[i].x = (float)i + johnMachinMethod();
            hostMem[i].y = (float)i + gregoryLeibnizSeries();
            hostMem[i].z = (float)i + bellardBBPTypeFormula();
            hostMem[i].w = (float)i + bellardBBPTypeFormula();
        }
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpu_stop);

        /* Validate: fabsf(), not abs(), for float differences. */
        for(i = 0; i < dim_x*j; i++){
            if(fabsf(hostMem[i].x - result[i].x) > 0.01f) fprintf(stderr, "%i)X) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].x - result[i].x), hostMem[i].x, result[i].x);
            if(fabsf(hostMem[i].y - result[i].y) > 0.01f) fprintf(stderr, "%i)Y) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].y - result[i].y), hostMem[i].y, result[i].y);
            if(fabsf(hostMem[i].z - result[i].z) > 0.01f) fprintf(stderr, "%i)Z) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].z - result[i].z), hostMem[i].z, result[i].z);
            if(fabsf(hostMem[i].w - result[i].w) > 0.01f) fprintf(stderr, "%i)W) difference of %f between cpu (%f) and gpu (%f)\n", i, fabsf(hostMem[i].w - result[i].w), hostMem[i].w, result[i].w);
        }

        t_c = (double)(cpu_stop.tv_sec - cpu_start.tv_sec) + (double)(cpu_stop.tv_nsec - cpu_start.tv_nsec)/1000000000.0;
        t_g = (double)(gpu_stop.tv_sec - gpu_start.tv_sec) + (double)(gpu_stop.tv_nsec - gpu_start.tv_nsec)/1000000000.0;
        cpu_runtime += t_c;
        gpu_runtime += t_g;
        printf("size = %i,\tt_g = %f s,\tt_c = %f s,\tspeedup %f x\n", j*dim_x, t_g, t_c, t_c/t_g);
    }
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    runtime = (double)(stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1000000000.0;
    printf("fdtd complete: total runtime = %fs, gpu = %fs, cpu = %fs, speedup = %f x\n", runtime, gpu_runtime, cpu_runtime, cpu_runtime/gpu_runtime);

    /* Free the last iteration's buffers. */
    cudaFree(devMem);
    free(result);
    free(hostMem);
    return 0;
}

It’s not a bug, it’s a feature. No really!

When you increase the size of the stream, the work gets distributed across all of the GPU's multiprocessors, so the more elements you add, the more the GPU calculates in parallel and the more the speedup increases.

For every iteration of for(j = 1; j <= 10; j++) you launch the kernel with j blocks (calcPi<<<j,dim_x>>>(devMem);). So you increase the number of blocks as you increase the stream size, and blocks are the units the GPU can execute in parallel.

It can also execute threads in parallel, but that is one level below blocks and not the significant factor in this case.

For j=1 you start 1 block. This block is assigned to one multiprocessor, and that multiprocessor calculates it. For j=2 you start 2 blocks, and those are assigned to multiprocessors so that they can run in parallel. You double the work and the time stays the same. You can keep doing this as long as the GPU has capacity left to parallelize. At some point the GPU has to serialize some blocks, and then your execution time should increase.

Another way to see this is that when you run the kernel with j=1 the GPU is not working at full capacity, not even at half of its capacity. This has to do with occupancy; have a look at the occupancy calculator to find out more about how the GPU works.
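For example, you can ask the runtime how many multiprocessors your card has; a minimal sketch, assuming device 0:

#include <stdio.h>
#include <cuda_runtime_api.h>

int main(void){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);    /* properties of device 0 */
    printf("%s has %d multiprocessors\n", prop.name, prop.multiProcessorCount);

    /* With a launch like calcPi<<<j, dim_x>>>, raising j up to (at least) this
       count adds parallel work rather than run time. */
    return 0;
}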

Thanks, I guess I didn't realize that 1 block goes to 1 multiprocessor, and since I was only running up to 10 blocks, I wasn't saturating my GPU (an 8800 Ultra) yet. I suppose that after 16 blocks I will reach the saturation point and not all of the blocks can be executed in parallel.

Yes and no. One multiprocessor can actually calculate more than 1 block at a time, depending on the number of registers you use per thread and the number of threads you run per block. Here's the occupancy calculator: http://forums.nvidia.com/index.php?showtopic=31279
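Newer CUDA toolkits also expose this calculation as a runtime API call, so you can query it from code instead of the spreadsheet; a rough sketch you could drop into the benchmark source above (reportOccupancy is just an illustrative helper name, and the call needs a reasonably recent toolkit):

__global__ void calcPi(float4 *pi);   /* the kernel defined in the benchmark source */

/* Illustrative helper: how many resident calcPi blocks fit on one multiprocessor,
   given the kernel's register and shared-memory usage. */
void reportOccupancy(int threadsPerBlock){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, calcPi, threadsPerBlock, 0);

    printf("calcPi: %d blocks per multiprocessor, %d blocks to fill the whole GPU\n",
           blocksPerSM, blocksPerSM * prop.multiProcessorCount);
}

You can also see the per-thread register count that feeds into this by compiling with nvcc --ptxas-options=-v.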

As seb mentioned, you definitely want to have more blocks than multiprocessors. This allows the scheduler to hide memory latency by running one block while another block is waiting on global reads. It can make a surprising difference sometimes.

I have been running it for longer, and it looks like it is starting to converge to a peak speedup of about 333x. This is an 8800 Ultra vs. a Core 2 Quad @ 2.4 GHz. Does a 333x speedup seem about right? Only one core is running on the CPU.

nvcc -c howto_streams.cu -I. -I/usr/local/cuda/include

g++ -o stream_tester  howto_streams.o  -lrt -lm -lcudart -I. -I/usr/local/cuda/include -L/usr/local/cuda/lib

LD_LIBRARY_PATH=/usr/local/cuda/lib ./stream_tester

size = 256,     t_g = 0.809483 s,       t_c = 16.934240 s,      speedup 20.919817 x

size = 512,     t_g = 0.808452 s,       t_c = 33.955900 s,      speedup 42.001140 x

size = 768,     t_g = 0.808969 s,       t_c = 50.626780 s,      speedup 62.581872 x

size = 1024,    t_g = 0.808982 s,       t_c = 67.420405 s,      speedup 83.339804 x

size = 1280,    t_g = 0.811770 s,       t_c = 84.882961 s,      speedup 104.565313 x

size = 1536,    t_g = 0.808998 s,       t_c = 101.534036 s,     speedup 125.505889 x

size = 1792,    t_g = 0.809008 s,       t_c = 118.700908 s,     speedup 146.723971 x

size = 2048,    t_g = 0.809000 s,       t_c = 135.407021 s,     speedup 167.375696 x

size = 2304,    t_g = 0.809012 s,       t_c = 152.629377 s,     speedup 188.661365 x

size = 2560,    t_g = 0.809042 s,       t_c = 169.134643 s,     speedup 209.055489 x

size = 2816,    t_g = 0.809014 s,       t_c = 186.648482 s,     speedup 230.711134 x

size = 3072,    t_g = 0.809033 s,       t_c = 203.437381 s,     speedup 251.457596 x

size = 3328,    t_g = 0.808504 s,       t_c = 219.850719 s,     speedup 271.922960 x

size = 3584,    t_g = 0.809394 s,       t_c = 237.406458 s,     speedup 293.313998 x

size = 3840,    t_g = 0.809031 s,       t_c = 253.833718 s,     speedup 313.750235 x

size = 4096,    t_g = 0.808546 s,       t_c = 270.267248 s,     speedup 334.263301 x

size = 4352,    t_g = 1.630859 s,       t_c = 287.655516 s,     speedup 176.382814 x

size = 4608,    t_g = 1.630898 s,       t_c = 304.711656 s,     speedup 186.836690 x

size = 4864,    t_g = 1.630894 s,       t_c = 321.097602 s,     speedup 196.884449 x

size = 5120,    t_g = 1.630973 s,       t_c = 338.058857 s,     speedup 207.274397 x

size = 5376,    t_g = 1.630921 s,       t_c = 355.583852 s,     speedup 218.026396 x

size = 5632,    t_g = 1.630957 s,       t_c = 372.167579 s,     speedup 228.189651 x

size = 5888,    t_g = 1.631412 s,       t_c = 389.079219 s,     speedup 238.492327 x

size = 6144,    t_g = 1.631346 s,       t_c = 405.546981 s,     speedup 248.596607 x

size = 6400,    t_g = 1.630960 s,       t_c = 423.331917 s,     speedup 259.559902 x

size = 6656,    t_g = 1.631343 s,       t_c = 439.793224 s,     speedup 269.589640 x

size = 6912,    t_g = 1.631370 s,       t_c = 456.513014 s,     speedup 279.834193 x

size = 7168,    t_g = 1.631020 s,       t_c = 473.778714 s,     speedup 290.480044 x

size = 7424,    t_g = 1.631062 s,       t_c = 490.256361 s,     speedup 300.574921 x

size = 7680,    t_g = 1.630970 s,       t_c = 507.460173 s,     speedup 311.140061 x

size = 7936,    t_g = 1.630954 s,       t_c = 524.421227 s,     speedup 321.542557 x

size = 8192,    t_g = 1.631378 s,       t_c = 541.475737 s,     speedup 331.913054 x

size = 8448,    t_g = 2.453411 s,       t_c = 558.613681 s,     speedup 227.688544 x

size = 8704,    t_g = 2.453810 s,       t_c = 575.858433 s,     speedup 234.679327 x

size = 8960,    t_g = 2.453524 s,       t_c = 591.847034 s,     speedup 241.223286 x

size = 9216,    t_g = 2.453523 s,       t_c = 609.534515 s,     speedup 248.432369 x

size = 9472,    t_g = 2.453522 s,       t_c = 626.609819 s,     speedup 255.391993 x

size = 9728,    t_g = 2.453511 s,       t_c = 643.998719 s,     speedup 262.480438 x

GPUs are “wicked fast” at running functions that are heavily used within shaders. Your kernel uses powf(), so I'm not surprised that this is a source of huge speedup on the GPU. In some of the CUDA talks I've given, I've pointed out that GPUs are especially fast for things like this that show up in shaders. I've heard of even larger speedups for a few codes that happen to use various other transcendental and related functions.
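One quick way to confirm how much of the time goes into powf() is to compare against a variant of one of the formulas with the constant powers hoisted out of the loop; a rough sketch (bellardBBPTypeFormulaFast is just an illustrative name, not part of the benchmark above):

/* Illustrative variant: the same sum as bellardBBPTypeFormula(), but the constant
   powers of two are folded in and powf(-1, i) / powf(2, 10*i) is carried as a
   running factor, so the loop makes no powf() calls at all. */
__host__ __device__ float bellardBBPTypeFormulaFast(){
    float pi = 0.0f;
    float scale = 1.0f;                      /* (-1)^i / 2^(10*i), updated each iteration */
    for(int i = 0; i < 10000; i++){
        float fi = (float)i;
        pi += scale * ( -(32.0f  / (4.0f  * fi + 1.0f))
                        -(1.0f   / (4.0f  * fi + 3.0f))
                        +(256.0f / (10.0f * fi + 1.0f))
                        -(64.0f  / (10.0f * fi + 3.0f))
                        -(4.0f   / (10.0f * fi + 5.0f))
                        -(4.0f   / (10.0f * fi + 7.0f))
                        +(1.0f   / (10.0f * fi + 9.0f)) );
        scale *= -1.0f / 1024.0f;            /* advance (-1)^i / 2^(10*i) */
    }
    return pi / 64.0f;                       /* the final 1/2^6 factor */
}

If the GPU-vs-CPU gap shrinks with this version, the transcendental functions really were the dominant cost.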

Cheers,

John Stone

Sweet, that means that my finite differencing code will be super fast then. :D

Don't be too excited. It can be very fast for embarrassingly parallel problems, but not that good for general algorithms. For the more complicated functions, which are normally your main bottleneck, reaching five or six times faster can already be considered a success, and debugging is really hard right now. Whenever I get more than a 100x speedup, I check the results first: the code may produce the right result when you only test a few thousand elements but, for some unknown reason, produce complete garbage at larger sizes.
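For example, a check based on relative error holds up better for large values than the fixed 0.01 threshold used in the benchmark above; a quick sketch (nearlyEqual is just an illustrative helper name):

#include <math.h>

/* Illustrative helper: compare a CPU and a GPU value with a relative tolerance,
   falling back to an absolute tolerance for values near zero. */
static int nearlyEqual(float cpu, float gpu, float relTol){
    float diff  = fabsf(cpu - gpu);
    float scale = fmaxf(fabsf(cpu), fabsf(gpu));
    return diff <= relTol * fmaxf(scale, 1.0f);
}

/* Usage, mirroring the validation loop in the benchmark:
   if(!nearlyEqual(hostMem[i].x, result[i].x, 1e-3f))
       fprintf(stderr, "%i)X) cpu %f vs gpu %f\n", i, hostMem[i].x, result[i].x);   */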