I wrote this simple code today to do some basic benchmarking. Basically all that is does is calculate pi in three diffrent ways for each element of a stream (x,y,z compnents) on the gpu and the cpu. The bigger the stream, the longer it take on the CPU, but for some reason, it looks like it is taking the same amount of time on the GPU regradless of the sream size… What am I doing wrong?
This is the output
size = 256, t_g = 0.809449 s, t_c = 17.136713 s, speedup 21.170845 x
size = 512, t_g = 0.808610 s, t_c = 34.299225 s, speedup 42.417538 x
size = 768, t_g = 0.809715 s, t_c = 51.471620 s, speedup 63.567544 x
size = 1024, t_g = 0.808499 s, t_c = 68.520511 s, speedup 84.750234 x
size = 1280, t_g = 0.809134 s, t_c = 85.705849 s, speedup 105.922963 x
size = 1536, t_g = 0.808985 s, t_c = 102.908285 s, speedup 127.206653 x
size = 1792, t_g = 0.808505 s, t_c = 119.568844 s, speedup 147.888785 x
size = 2048, t_g = 0.807108 s, t_c = 135.830341 s, speedup 168.292548 x
size = 2304, t_g = 0.808790 s, t_c = 153.502859 s, speedup 189.793114 x
size = 2560, t_g = 0.807486 s, t_c = 169.191967 s, speedup 209.529397 x
fdtd complete: total runtime = 946.580994s, gpu = 8.086282s, cpu = 938.136230s, speedup = 116.015770 x
Here is the code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <cuda_runtime_api.h>
#define PI_50 3.14159265358979323846264338327950288419716939937510
__device__ float4 *devMem = NULL;
int dim_x = 256;
__global__ void init(float4 *init_me){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
init_me[idx].x = 0.0;
init_me[idx].y = 0.0;
init_me[idx].z = 0.0;
init_me[idx].w = 0.0;
}
__global__ void plusOne(float4 *add_to_me){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
add_to_me[idx].x += 1;
}
__host__ __device__ float bellardBBPTypeFormula(){
int i;
float pi = 0.0;
for(i = 0; i < 10000; i++){
pi += ( \
(powf(-1.0, (float)i) / powf(2.0, 10.0 * (float)i)) * \
( -(powf(2.0, 5.0) / (4.0 * (float)i + 1.0)) \
-(1.0 / (4.0 * (float)i + 3.0)) \
+(powf(2.0, 8.0) / (10.0 * (float)i + 1.0)) \
-(powf(2.0, 6.0) / (10.0 * (float)i + 3.0)) \
-(powf(2.0, 2.0) / (10.0 * (float)i + 5.0)) \
-(powf(2.0, 2.0) / (10.0 * (float)i + 7.0)) \
+(1.0 / (10.0 * (float)i + 9.0)) \
) \
);
}
pi *= (1.0 / powf(2.0, 6.0));
return pi;
}__host__ __device__ float gregoryLeibnizSeries(){
int i;
float pi = 0.0;
for(i = 0; i < 1000000; i++){
pi += powf(-1.0, (float)i) * (4.0 / (1.0 + 2.0 * (float)i));
}
return pi;
}
__host__ __device__ float johnMachinMethod(){
return 4.0 * (4.0 * atanf(1.0 / 5.0) - atanf(1.0 / 239.0));
}
__global__ void calcPi(float4 *pi){
int idx = blockDim.x * blockIdx.x + threadIdx.x;
pi[idx].x = (float)idx * johnMachinMethod();
pi[idx].y = (float)idx * gregoryLeibnizSeries();
pi[idx].z = (float)idx * bellardBBPTypeFormula();
pi[idx].w = (float)idx * bellardBBPTypeFormula();
}
int main(int argc, char **argv){
struct timespec start, stop;
struct timespec gpu_start, gpu_stop;
struct timespec cpu_start, cpu_stop;
int i;
int j;
float runtime = 0.0, gpu_runtime = 0.0, cpu_runtime = 0.0;
float4 *result = NULL;
float4 *hostMem = NULL;
double t_c, t_g;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&start);
for(j = 1; j <= 10; j++){
cudaFree(devMem);
free(result);
free(hostMem);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&gpu_start);
result = (float4*)calloc(sizeof(float4),dim_x*j);
cudaMalloc((void**)&devMem, j*dim_x*sizeof(float4));
init<<<j,dim_x>>>(devMem);
calcPi<<<j,dim_x>>>(devMem);
cudaMemcpy(result, devMem, j*dim_x*sizeof(float4), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&gpu_stop);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&cpu_start);
hostMem = (float4*)calloc(sizeof(float4),dim_x*j);
for(i = 0; i < dim_x*j; i++){
hostMem[i].x = (float)i + johnMachinMethod();
hostMem[i].y = (float)i + gregoryLeibnizSeries();
hostMem[i].z = (float)i + bellardBBPTypeFormula();
hostMem[i].w = (float)i + bellardBBPTypeFormula();
}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&cpu_stop);
for(i = 0; i < dim_x*j; i++){
if(abs(hostMem[i].x - result[i].x) > 0.01) fprintf(stderr,"%i)X) greater than 1%% (%f) diffrence between cpu (%f) and gpu (%f)\n",i,abs(hostMem[i].x - result[i].x), hostMem[i].x, result[i].x);
if(abs(hostMem[i].y - result[i].y) > 0.01) fprintf(stderr,"%i)Y) greater than 1%% (%f) diffrence between cpu (%f) and gpu (%f)\n",i,abs(hostMem[i].y - result[i].y), hostMem[i].y, result[i].y);
if(abs(hostMem[i].z - result[i].z) > 0.01) fprintf(stderr,"%i)Z) greater than 1%% (%f) diffrence between cpu (%f) and gpu (%f)\n",i,abs(hostMem[i].z - result[i].z), hostMem[i].z, result[i].z);
if(abs(hostMem[i].w - result[i].w) > 0.01) fprintf(stderr,"%i)W) greater than 1%% (%f) diffrence between cpu (%f) and gpu (%f)\n",i,abs(hostMem[i].w - result[i].w), hostMem[i].w, result[i].w);
}
t_c = (double)(cpu_stop.tv_sec - cpu_start.tv_sec) + (double)(cpu_stop.tv_nsec - cpu_start.tv_nsec)/1000000000.0;
t_g = (double)(gpu_stop.tv_sec - gpu_start.tv_sec) + (double)(gpu_stop.tv_nsec - gpu_start.tv_nsec)/1000000000.0;
cpu_runtime += t_c;
gpu_runtime += t_g;
printf("size = %i,\tt_g = %f s,\tt_c = %f s,\tspeedup %f x\n",j*dim_x, t_g, t_c, t_c/t_g);
}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&stop);
runtime = (double)(stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1000000000.0;
printf("fdtd complete: total runtime = %fs, gpu = %fs, cpu = %fs, speedup = %f x\n",runtime, gpu_runtime, cpu_runtime, cpu_runtime/gpu_runtime);
return 0;
}