How to compare GPU performance with CPU performance

Rajakumar8 · November 22, 2013, 8:33am

I have a Tesla K10 for GPU computing. When I run the CPU and GPU programs based on the NVIDIA CUDA samples, both take about the same amount of time to execute. Is there any way to find out whether computing on the Tesla K10 is faster than computing on the CPU? I am trying to verify mathematical computation on the Tesla K10, and I am unsure whether my program is using the full potential of the GPU.
The machine runs 64-bit Windows 7 and has an Intel Core i7 processor, 16 GB of RAM, and an onboard Intel HD Graphics 4000 video adapter.

The following .NET C++ code is used to test the GPU:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <conio.h>

// Number of elements in each vector.
#define N 80000

// Threads per block used by the launch in main(); a multiple of the
// 32-thread warp size so that no lanes of a warp are left idle.
#define THREADS_PER_BLOCK 256

// Evaluates a CUDA runtime call. On failure it prints the failing location
// and the runtime's error string to stderr and makes the ENCLOSING function
// return 1, so it may only be used inside functions returning int (main).
// Device allocations still outstanding on that path are released when the
// process exits.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf( stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                     __LINE__, cudaGetErrorString( status_ ) );             \
            return 1;                                                       \
        }                                                                   \
    } while (0)

/*
 * Element-wise vector addition on the GPU: c[i] = a[i] + b[i], 0 <= i < N.
 *
 * a, b : device pointers to N input ints each.
 * c    : device pointer to N output ints.
 *
 * Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so
 * every element is processed exactly once whatever the grid/block sizes are
 * (including the original <<<N,1>>> configuration). No shared memory is used.
 */
__global__ void add( int *a, int *b, int *c )
{
    // size_t keeps the index arithmetic from overflowing for very large grids.
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         tid < N; tid += stride)
        c[tid] = a[tid] + b[tid];
}

/*
 * Fills two host vectors, adds them on the GPU, reports the kernel's
 * execution time on stderr, prints every result line on stdout and waits
 * for a key press.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main( void ) {
    // Static storage: the three arrays total about 960 KB, which would
    // nearly exhaust the default 1 MB thread stack on Windows.
    static int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = N * sizeof(int);

    // allocate the memory on the GPU
    CUDA_CHECK( cudaMalloc( (void**)&dev_a, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_b, bytes ) );
    CUDA_CHECK( cudaMalloc( (void**)&dev_c, bytes ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        // i * i exceeds INT_MAX once i > 46340, which is undefined for
        // signed int; multiplying as unsigned makes the wrap-around defined.
        b[i] = (int)( (unsigned int)i * (unsigned int)i );
    }

    // copy the arrays 'a' and 'b' to the GPU
    CUDA_CHECK( cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ) );

    // Events bracket the kernel so its GPU execution time can be measured.
    // The host<->device copies are deliberately outside the timed region.
    cudaEvent_t start, stop;
    CUDA_CHECK( cudaEventCreate( &start ) );
    CUDA_CHECK( cudaEventCreate( &stop ) );

    // Ceil-divide so the grid covers all N elements. Launching one thread
    // per block (<<<N,1>>>) would leave 31 of every warp's 32 lanes idle.
    const int blocks = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    CUDA_CHECK( cudaEventRecord( start, 0 ) );
    add<<<blocks, THREADS_PER_BLOCK>>>( dev_a, dev_b, dev_c );
    CUDA_CHECK( cudaGetLastError() );        // launch-configuration errors
    CUDA_CHECK( cudaEventRecord( stop, 0 ) );

    // Wait for the kernel (and both events) to finish; also surfaces any
    // error raised while the kernel was executing.
    CUDA_CHECK( cudaDeviceSynchronize() );

    float kernel_ms = 0.0f;
    CUDA_CHECK( cudaEventElapsedTime( &kernel_ms, start, stop ) );

    // copy the array 'c' back from the GPU to the CPU
    CUDA_CHECK( cudaMemcpy( c, dev_c, bytes, cudaMemcpyDeviceToHost ) );

    // Timing goes to stderr so the result listing on stdout is unchanged.
    fprintf( stderr, "GPU kernel time: %.3f ms (%d blocks x %d threads)\n",
             kernel_ms, blocks, THREADS_PER_BLOCK );

    // display the results
    for (int i = 0; i < N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }
    printf("Over");

    // release the events and the memory allocated on the GPU
    CUDA_CHECK( cudaEventDestroy( start ) );
    CUDA_CHECK( cudaEventDestroy( stop ) );
    CUDA_CHECK( cudaFree( dev_a ) );
    CUDA_CHECK( cudaFree( dev_b ) );
    CUDA_CHECK( cudaFree( dev_c ) );

    // keep the console window open until a key is pressed
    _getch();

    return 0;
}

The following .NET C++ code is used to test the CPU:

#include <stdio.h>
#include <conio.h>

// Number of elements in each vector.
#define N 80000

/*
 * Element-wise vector addition on the CPU: c[i] = a[i] + b[i], 0 <= i < N.
 *
 * a, b : pointers to N input ints each.
 * c    : pointer to N output ints; every element is overwritten.
 *
 * Prints "over" to stdout once all N sums have been stored.
 */
void add( int *a, int *b, int *c ) {
    // a single CPU thread walks the whole range, one element per iteration
    for (int tid = 0; tid < N; tid += 1) {
        c[tid] = a[tid] + b[tid];
    }
    printf("over");
}

/*
 * Fills two vectors, adds them on the CPU with add(), prints every result
 * line on stdout and waits for a key press. Always returns 0.
 */
int main( void ) {
    // Static storage: the three arrays total about 960 KB, which would
    // nearly exhaust the default 1 MB thread stack on Windows.
    static int a[N], b[N], c[N];

    // fill the arrays 'a' and 'b' on the CPU
    for (int i = 0; i < N; i++) {
        a[i] = i;
        // i * i exceeds INT_MAX once i > 46340, which is undefined for
        // signed int; multiplying as unsigned makes the wrap-around defined.
        b[i] = (int)( (unsigned int)i * (unsigned int)i );
    }

    add( a, b, c );

    // display the results
    for (int i = 0; i < N; i++) {
        printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // keep the console window open until a key is pressed
    _getch();

    return 0;
}