Hi everyone. I’m writing a CUDA code to calculate electron density inside a quantum dot, and the part for solving poisson equation is written in both CUDA and C++.

my problem is that when using cuda code the results are quite different but they converge(which I think they shouldn’t because of the error but I’m not sure). I was

wondering if this is normal in compute capability 1.1 devices (GeForce 9500GT)? (MSE is 0.000001 but in a 81*81 grid when the biggest number is about 0.001 it’s

really high and the electron density graphs are different)

here is the code for both gpu and cpu:

GPU:

```
__global__ void CudaPoissonSolve(float* rho, float* out, int numPoints, float delta)
{
int j = threadIdx.x + blockIdx.x * blockDim.x;
int i = threadIdx.y + blockIdx.y * blockDim.y;
int idx = j + i * numPoints;
if( i >= numPoints || j >= numPoints )
return;
float tempout = 0.0f;
for(int k = 0; k < numPoints; k++)
{
float x1 = j * delta;
float y1 = i * delta;
for(int l = 0; l < numPoints; l++)
{
if( i == k && j == l )
tempout += 2.0f * 1.77245385091f * rho[idx] / delta;
else
{
float x2 = l * delta;
float y2 = k * delta;
float dx = x1 - x2, dy = y1 - y2;
int index = j + i * numPoints;
tempout += rho[index] / sqrtf( (dx * dx + dy * dy) );
}
}
}
out[idx] = tempout * delta * delta;
}
```

CPU:

```
void QuantumDot::PoissonSolve(QDGrid& rho, QDGrid& out)
{
for(long i = 0; i < (long)mNumPoints; i++)
for(long j = 0; j < (long)mNumPoints; j++)
{
float x1 = MeshX(i, j), y1 = MeshY(i, j);
for(long k = 0; k < (long)mNumPoints; k++)
for(long l = 0; l < (long)mNumPoints; l++)
{
float x2 = MeshX(k, l), y2 = MeshY(k, l);
float dx = x1 - x2, dy = y1 - y2;
if( i == k && j == l )
out.At(i, j) += 2.0f * 1.77245385091f * rho.At(i, j) / mDelta;
else
out.At(i, j) += rho.At(k, l) / sqrt(dx * dx + dy * dy);
}
out.At(i, j) *= mDelta2; //mDelta2 = mDelta * mDelta
}
}
```