I’m relatively new to CUDA. Been dabbling in GPGPU for some time now, but just recently switched from ATI to NVIDIA, so the architecture and API are still a bit fresh to me.

So I’ve written a simple test program which generates pi using a monte carlo method. It’s a very inefficient way to find an approximation of pi, but it works well for a benchmark, and it’s easily parallelizable.

So I wanted to write an implementation of this pi approximation using both vanilla C++ and CUDA acceleration, to see what performance difference I could get out of the two.

Here is the vanilla code, to give you an idea of what this does:

```
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <ctime>
using namespace std;
#define iterations 1000000000
int withinCircle(float& a, float& b)
{
return (sqrt((a * a) + (b * b)) <= 1.0f);
}
void randomize(float* a, size_t size)
{
for(size_t i = 0; i < size; i++) a[i] = (rand() / (float)RAND_MAX);
}
int main()
{
int gridSize = 1024;
int numGrids = iterations / gridSize + ((iterations % gridSize) ? 1 : 0);
float* a = new float[gridSize];
float* b = new float[gridSize];
unsigned int hits = 0;
srand(time(NULL));
for(int i = 0; i < numGrids; i++)
{
randomize(a, gridSize);
randomize(b, gridSize);
for(int j = 0; j < gridSize; j++) hits += withinCircle(a[j], b[j]);
}
cout << "Pi = " << (hits * 4) / (float)iterations << endl;
delete [] a;
delete [] b;
}
```

I then modified this to use GPU acceleration:

```
#include <iostream>
#include <cmath>
#include <ctime>
#include <cstdlib>
#define iterations 1000000000
using namespace std;
__global__ void withinCircle(float* a, float* b, float* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float a_t = a[idx], b_t = b[idx];
c[idx] = sqrt((a_t * a_t) + (b_t * b_t));
}
void randomize(float* a, size_t size)
{
for(int i = 0; i < size; i++)
a[i] = (rand() / (float)RAND_MAX);
}
int main()
{
srand(time(NULL));
unsigned long long hits = 0;
int blocksPerGrid = 16384;
int threadsPerBlock = 512;
int gridSize = blocksPerGrid * threadsPerBlock;
int numGrids = iterations / gridSize + ((iterations % gridSize) ? 1 : 0);
cout << numGrids << " Grids,\n" << blocksPerGrid << " Blocks per grid,\n" <<
threadsPerBlock << " Threads per block\n" << endl;
float* a_h = (float*)malloc(sizeof(float) * gridSize);
float* b_h = (float*)malloc(sizeof(float) * gridSize);
float* c_h = (float*)malloc(sizeof(float) * gridSize);
float *a_d, *b_d, *c_d;
cudaMalloc(&a_d, sizeof(float) * gridSize);
cudaMalloc(&b_d, sizeof(float) * gridSize);
cudaMalloc(&c_d, sizeof(float) * gridSize);
for(int i = 0; i < numGrids; i++)
{
randomize(a_h, gridSize);
randomize(b_h, gridSize);
cudaMemcpy(a_d, a_h, sizeof(float) * gridSize, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, sizeof(float) * gridSize, cudaMemcpyHostToDevice);
withinCircle<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, c_d);
cudaThreadSynchronize();
cudaMemcpy(c_h, c_d, sizeof(float) * gridSize, cudaMemcpyDeviceToHost);
for(int i = 0; i < gridSize; i++) hits += (c_h[i] <= 1.0f);
}
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
free(a_h);
free(b_h);
free(c_h);
cout << hits << " hits, " << iterations << " iterations" << endl;
cout << "Pi = " << (hits * 4) / (float)iterations << endl;
}
```

Both work as they should. Now, the single-threaded vanilla processor-only code completes in about 26 seconds. The CUDA accelerated version completes in 29 seconds. I know my CUDA accelerated version is far from being well optimized, so my question is how exactly you would optimize this. I know memory accesses are a huge performance killer with CUDA, and GPGPU in general, but with a kernel this simple, I really can’t quite understand how to improve memory access times.

Any tips on what you would do to optimize this would be greatly appreciated. This example program doesn’t have any real world application, but I’m trying to get a grasp in CUDA and the architecture through writing and optimizing this simple stuff.

Cheers.