Hello everybody!

I am trying to compare CPU to GPU speedup and precision of computation using a simple code.

I solve the heat equation with finite difference and explicit schemes in 2D. I do 1000 time iterations and compare the sum of the 2D array (I’m interested by the precision).

For the speedup, I just capture the time of execution of the program (linux ‘time’).

You’ll find the cpp and cu files below. The compilation instructions are given in a comment on the first line of each file.

I use CUDA 8.0 for the .cu files and IntelC++Compiler v17 for the .cpp file.

I am on macOS with an Intel i7 at 3.3 GHz and an NVIDIA GT 650M.

main.cpp

```
// /opt/intel/bin/icpc -O2 -fp-model precise main.cpp && time ./a.out
#include <iostream>
#include <cmath>
#include <iomanip>
// Sequential reference: 2D heat equation, 4th-order central differences in
// space, explicit Euler in time, 1000 steps on a 2800x2800 grid with a
// 2-cell ghost layer on each side. Prints the mean of the final field as a
// precision probe for comparison against the GPU versions.
int main()
{
    const int nx = 2800;          // interior cells in x
    const int ny = 2800;          // interior cells in y
    const float lx = 1.f;         // domain length in x
    const float ly = 1.f;         // domain length in y
    float dx = lx / (float)nx;
    float dy = ly / (float)ny;
    const float lambda = 0.001f;  // diffusivity
    // dt is 1% of the explicit stability limit of the 4th-order stencil.
    float dt = 0.01f/lambda/(1.0f/(dx*dx)/12.f+1.0f/(dy*dy)/12.f);
    float dtdx = dt*lambda/dx/dx/12.f;
    float dtdy = dt*lambda/dy/dy/12.f;
    int nxt = nx + 4;             // +4: two ghost cells per side
    int nyt = ny + 4;
    float* T = new float[nxt*nyt];
    float* Told = new float[nxt*nyt];
    // Initial condition: 1 + Gaussian bump centred on the domain; both
    // arrays start identical (the ghost cells stay at this value forever).
    for (int i = 0; i < nxt; i++) {
        for (int j = 0; j < nyt; j++) {
            int ind = i*nyt + j;
            T[ind] = 1.0f + exp(-32.f*(pow((float)(j-nyt/2)/(float)nyt,2)+pow((float)(i-nxt/2)/(float)nxt,2)));
            Told[ind] = T[ind];
        }
    }
    double time = std::clock();
    for (int step=0; step<1000; step++) {
        // 4th-order 5-point stencil in each direction, interior cells only.
        for (int i = 2; i < nxt-2; i++) {
            for (int j = 2; j < nyt-2; j++) {
                int ind = i*nyt + j;
                T[ind] = Told[ind] + dtdx * (-Told[(i-2)*nyt + (j)]+16.f*Told[(i-1)*nyt + (j)]-30.f*Told[(i)*nyt + (j)]+16.f*Told[(i+1)*nyt + (j)]-Told[(i+2)*nyt + (j)])
                + dtdy * (-Told[(i)*nyt + (j-2)]+16.f*Told[(i)*nyt + (j-1)]-30.f*Told[(i)*nyt + (j)]+16.f*Told[(i)*nyt + (j+1)]-Told[(i)*nyt + (j+2)]);
            }
        }
        // Pointer swap instead of the original O(nxt*nyt) element copy:
        // the ghost cells are identical in both arrays and the interior is
        // fully rewritten every step, so the produced values are unchanged.
        std::swap(T, Told);
    }
    std::swap(T, Told);  // put the newest field back into T for the output
    (void)time;          // used only by the commented-out timing print below
    //std::cout << 1000.0*(std::clock()-time)/(double) CLOCKS_PER_SEC << " ms" << std::endl;
    float sum = 0.0f;
    for (int i = 0; i < nxt*nyt; i++) {
        sum += T[i];
    }
    std::cout << "Sum: " << std::setprecision(10) << sum/(float)(nxt*nyt) << std::endl;
    // Fix: the original leaked both 2804x2804 float arrays.
    delete [] T;
    delete [] Told;
    return 0;
}
```

main.cu

```
// /usr/local/cuda/bin/nvcc --gpu-architecture=sm_30 -O2 main.cu && time ./a.out
#include <cuda.h>
#include <iostream>
#include <cmath>
#include <iomanip>
// Advances the 2D heat equation with the 4th-order explicit stencil for
// 1000 time steps ENTIRELY inside one kernel launch.
//
// Launch layout: 2D grid of 32x32 thread blocks, offset by blockDim-4 = 28
// cells so that each block carries a 2-cell halo; only the 28x28 interior
// threads write. T is row-major with row stride N.
//
// NOTE(review): keeping the time loop in-kernel is NOT correct for a
// multi-block launch. A block's halo cells are its neighbour's interior
// cells, and there is no grid-wide barrier between time steps, so blocks
// race on global memory — this is why the result differs from the CPU
// reference. A multi-second launch on the display GPU also trips the OS
// watchdog (the observed freeze/crash). The correct structure is one
// launch per time step, as in main_2.cu. The fixes below only remove the
// intra-block races and the shared-memory over-allocation.
__global__ void solve(float* T, float dtdx, float dtdy, int N)
{
    int indX = blockIdx.x * (blockDim.x-4) + threadIdx.x;
    int indY = blockIdx.y * (blockDim.y-4) + threadIdx.y;
    // Only one 32x32 tile is ever indexed; the original 32*32*6 allocation
    // wasted 20 KB of shared memory per block and needlessly cut occupancy.
    __shared__ float temp[32*32];
    for (int step=0; step<1000; step++) {
        temp[threadIdx.x*32+threadIdx.y] = T[indX * N + indY];
        __syncthreads();  // whole tile (incl. halo) loaded before any read
        if(threadIdx.x > 1 && threadIdx.x < (blockDim.x-2) && threadIdx.y > 1 && threadIdx.y < (blockDim.y-2))
            T[indX * N + indY] = temp[(threadIdx.x)*32+(threadIdx.y)] + dtdx * (-temp[(threadIdx.x-2)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x-1)*32+(threadIdx.y)]-30.f*temp[(threadIdx.x)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x+1)*32+(threadIdx.y)]-temp[(threadIdx.x+2)*32+(threadIdx.y)])
            + dtdy * (-temp[(threadIdx.x)*32+(threadIdx.y-2)]+16.f*temp[(threadIdx.x)*32+(threadIdx.y-1)]-30.f*temp[(threadIdx.x)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x)*32+(threadIdx.y+1)]-temp[(threadIdx.x)*32+(threadIdx.y+2)]);
        // Fix: without this barrier (placed outside the divergent if, so all
        // threads reach it) the next iteration's shared-memory store could
        // overwrite the tile while other warps are still reading it.
        __syncthreads();
    }
}
// Host driver for the "time loop inside the kernel" variant (see solve()).
// Sets up the same problem as main.cpp, runs one big kernel launch, and
// prints the mean of the final field.
int main()
{
    const int nx = 2800;          // interior cells in x
    const int ny = 2800;          // interior cells in y
    const float lx = 1.f;
    const float ly = 1.f;
    float dx = lx / (float)nx;
    float dy = ly / (float)ny;
    const float lambda = 0.001f;  // diffusivity
    // dt is 1% of the explicit stability limit of the 4th-order stencil.
    float dt = 0.01f/lambda/(1.0f/(dx*dx)/12.f+1.0f/(dy*dy)/12.f);
    float dtdx = dt*lambda/dx/dx/12.f;
    float dtdy = dt*lambda/dy/dy/12.f;
    int nxt = nx + 4;             // +4: two ghost cells per side
    int nyt = ny + 4;
    float* T = new float[nxt*nyt];
    // Initial condition: 1 + Gaussian bump centred on the domain.
    for (int i = 0; i < nxt; i++) {
        for (int j = 0; j < nyt; j++) {
            int ind = i*nyt + j;
            T[ind] = 1.0f + exp(-32.f*(pow((float)(j-nyt/2)/(float)nyt,2)+pow((float)(i-nxt/2)/(float)nxt,2)));
        }
    }
    float* Tk = 0;
    // Fix: every CUDA API call returns a status the original ignored.
    cudaError_t err = cudaMalloc((void**)&Tk,nxt*nyt*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        delete [] T;
        return 1;
    }
    err = cudaMemcpy(Tk,T,nxt*nyt*sizeof(float),cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "H2D copy failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(Tk);
        delete [] T;
        return 1;
    }
    // 100x100 blocks of 32x32 threads; each block updates a 28x28 interior
    // tile (the 2-cell halo threads only load shared memory).
    dim3 block(100,100,1);
    dim3 thread(32,32,1);
    double time = std::clock();
    solve<<<block, thread>>>(Tk, dtdx, dtdy, nyt);
    // Kernel launches are asynchronous and report nothing by themselves:
    // launch-configuration errors only surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    (void)time;  // used only by the commented-out timing print below
    //std::cout << 1000.0*(std::clock()-time)/(double) CLOCKS_PER_SEC << " ms" << std::endl;
    //time = std::clock();
    // This blocking copy also synchronizes with the kernel, so it surfaces
    // any asynchronous execution error (e.g. a watchdog kill) as well.
    err = cudaMemcpy(T,Tk,nxt*nyt*sizeof(float),cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        std::cerr << "D2H copy failed: " << cudaGetErrorString(err) << std::endl;
    cudaFree(Tk);
    //std::cout << "SendMem " << 1000.0*(std::clock()-time)/(double) CLOCKS_PER_SEC << " ms" << std::endl;
    float sum = 0.0f;
    for (int i = 0; i < nxt*nyt; i++) {
        sum += T[i];
    }
    std::cout << "Sum: " << std::setprecision(10) << sum/(float)(nxt*nyt) << std::endl;
    // Fix: the original leaked the 2804x2804 host array.
    delete [] T;
    return 0;
}
```

1st question about precision:

The sequential code (main.cpp) gives me a value that we can fairly take as our ‘true’ value (in my case: 1.087123036).

If I execute the CUDA code given above (main.cu), I end up with a slightly different value (1.087129474). Thus, the precision of the computation is not conserved between plain C++ and CUDA C++…

Does somebody know a bit about precision (float/double) in CUDA?

Is there any optimisation from the compiler regarding floating-point calculations?

2nd question about limit of instructions inside the Kernel:

If I execute the CUDA code on my machine (main.cu), my OS freezes for the duration of the computation and sometimes the system crashes…

If you look at my code, I have put the time loop inside the kernel… I know there is a limit of instructions that you can put in a CUDA kernel but how can I check the number of instructions in my kernel?

If I put the time loop outside of my kernel (as in the main_2.cu given below), the execution works fine but the time of computation is really long… even longer (way longer) than sequential computation (cpp file).

But it seems from all the tutorial I’ve seen so far that the time loop is always outside of the kernel… Thus, the kernel is called every time step while in my first code (main.cu), the code calls one kernel and everything is done inside that kernel.

Using the main_2.cu, the final value is correct (1.087123036), so where is the issue with main.cu?

Could this issue (crash) be due to the limit of instructions inside the kernel?

If the correct solution is main_2.cu, why is the CUDA version slower than the classical sequential computation?

(In my case, main.cpp takes 13 s, main.cu 10 s (but only 0.1 s inside the kernel, the rest is memory transfer) and main_2.cu 44 s.)

main_2.cu

```
// /usr/local/cuda/bin/nvcc --gpu-architecture=sm_30 -O2 main_2.cu && time ./a.out
#include <cuda.h>
#include <iostream>
#include <cmath>
#include <iomanip>
// One explicit 4th-order time step of the 2D heat equation.
//
// Launch layout: 2D grid of 32x32 thread blocks, offset by blockDim-4 = 28
// cells so that each block carries a 2-cell halo on every side; only the
// 28x28 interior threads write, so adjacent blocks never write the same
// cell. T is the field in row-major layout with row stride N; dtdx/dtdy
// are the precomputed stencil coefficients. The host calls this once per
// time step, which provides the implicit grid-wide barrier between steps.
__global__ void solve(float* T, float dtdx, float dtdy, int N)
{
    int indX = blockIdx.x * (blockDim.x-4) + threadIdx.x;
    int indY = blockIdx.y * (blockDim.y-4) + threadIdx.y;
    // Only one 32x32 tile is ever indexed; the original 32*32*6 allocation
    // used 24 KB of shared memory per block and needlessly limited how many
    // blocks can be resident per SM.
    __shared__ float temp[32*32];
    temp[threadIdx.x*32+threadIdx.y] = T[indX * N + indY];
    __syncthreads();  // whole tile (incl. halo) loaded before any read
    if(threadIdx.x > 1 && threadIdx.x < (blockDim.x-2) && threadIdx.y > 1 && threadIdx.y < (blockDim.y-2))
        T[indX * N + indY] = temp[(threadIdx.x)*32+(threadIdx.y)] + dtdx * (-temp[(threadIdx.x-2)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x-1)*32+(threadIdx.y)]-30.f*temp[(threadIdx.x)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x+1)*32+(threadIdx.y)]-temp[(threadIdx.x+2)*32+(threadIdx.y)])
        + dtdy * (-temp[(threadIdx.x)*32+(threadIdx.y-2)]+16.f*temp[(threadIdx.x)*32+(threadIdx.y-1)]-30.f*temp[(threadIdx.x)*32+(threadIdx.y)]+16.f*temp[(threadIdx.x)*32+(threadIdx.y+1)]-temp[(threadIdx.x)*32+(threadIdx.y+2)]);
}
// Host driver for the "one kernel launch per time step" variant — this is
// the correct structure: each launch acts as a grid-wide barrier, so the
// halo reads of step k+1 always see the completed writes of step k.
int main()
{
    const int nx = 2800;          // interior cells in x
    const int ny = 2800;          // interior cells in y
    const float lx = 1.f;
    const float ly = 1.f;
    float dx = lx / (float)nx;
    float dy = ly / (float)ny;
    const float lambda = 0.001f;  // diffusivity
    // dt is 1% of the explicit stability limit of the 4th-order stencil.
    float dt = 0.01f/lambda/(1.0f/(dx*dx)/12.f+1.0f/(dy*dy)/12.f);
    float dtdx = dt*lambda/dx/dx/12.f;
    float dtdy = dt*lambda/dy/dy/12.f;
    int nxt = nx + 4;             // +4: two ghost cells per side
    int nyt = ny + 4;
    float* T = new float[nxt*nyt];
    // Initial condition: 1 + Gaussian bump centred on the domain.
    for (int i = 0; i < nxt; i++) {
        for (int j = 0; j < nyt; j++) {
            int ind = i*nyt + j;
            T[ind] = 1.0f + exp(-32.f*(pow((float)(j-nyt/2)/(float)nyt,2)+pow((float)(i-nxt/2)/(float)nxt,2)));
        }
    }
    float* Tk = 0;
    // Fix: every CUDA API call returns a status the original ignored.
    cudaError_t err = cudaMalloc((void**)&Tk,nxt*nyt*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        delete [] T;
        return 1;
    }
    err = cudaMemcpy(Tk,T,nxt*nyt*sizeof(float),cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "H2D copy failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(Tk);
        delete [] T;
        return 1;
    }
    // 100x100 blocks of 32x32 threads; each block updates a 28x28 interior
    // tile (the 2-cell halo threads only load shared memory).
    dim3 block(100,100,1);
    dim3 thread(32,32,1);
    double time = std::clock();
    for (int step=0; step<1000; step++) {
        solve<<<block, thread>>>(Tk, dtdx, dtdy, nyt);
    }
    // Launches are asynchronous: check for launch errors explicitly, and
    // note that timing the loop with std::clock (without a device sync)
    // only measures enqueue time, not kernel execution time.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    (void)time;  // used only by the commented-out timing print below
    //std::cout << 1000.0*(std::clock()-time)/(double) CLOCKS_PER_SEC << " ms" << std::endl;
    //time = std::clock();
    // This blocking copy synchronizes with all queued kernels, so it also
    // surfaces any asynchronous execution error.
    err = cudaMemcpy(T,Tk,nxt*nyt*sizeof(float),cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        std::cerr << "D2H copy failed: " << cudaGetErrorString(err) << std::endl;
    cudaFree(Tk);
    //std::cout << "SendMem " << 1000.0*(std::clock()-time)/(double) CLOCKS_PER_SEC << " ms" << std::endl;
    float sum = 0.0f;
    for (int i = 0; i < nxt*nyt; i++) {
        sum += T[i];
    }
    std::cout << "Sum: " << std::setprecision(10) << sum/(float)(nxt*nyt) << std::endl;
    // Fix: the original leaked the 2804x2804 host array.
    delete [] T;
    return 0;
}
```

Thank you all!