Hi

I need to do the porting from ANSI C to CUDA of a finite difference function. I have no idea why I am not even able to run the same code without parallelization. Can anyone help me? in some case I experienced that the result of the finite difference was zero (something like sum of (0.5 * 0.9) = 0.0) which does not make any sense.

This is the code of the C function which is perfectly working:

```
void DiffX_CPU(float* U, float* Ux, int N, int alpha, float* stencils, int rank)
// CPU routine to compute spatial derivative in one space dimension.
{
// index in output vector Ux to be computed
int row;
for (row=0; row<N; ++row)
{
float value=0.0;
// Compute dot-product between FDM stencil weights and input vector U
int diff = 0; // diff is used for automatically taking one-sided difference near boundaries
if (row<alpha)
diff = alpha - row;
else if (row>N-1-alpha) // row > Nx-3 Nx-2 Nx-1
diff = N-1-alpha-row;
int tmp = (alpha-diff)*rank+alpha;
int tmp2 = row + diff;
int i;
for (i = -alpha; i<alpha+1; ++i)
value += U[tmp2+i]*stencils[tmp+i];
// Store computed approximation
Ux[row] = value;
}
}
```

And this is the code without the parallalization in CUDA:

```
__global__ void DiffX_GPU(float* d_U, float* d_Ux, int N, int alpha, float* d_stencils, int rank)
{
// indices
// const int b_i = threadIdx.x;
// const int b_j = blockIdx.y*blockDim.y + threadIdx.y;
// const int n = b_i * N + b_j;
int row;
for (row=0; row<N; ++row)
{
float value=0.0;
// Compute dot-product between FDM stencil weights and input vector U
int diff = 0; // diff is used for automatically taking one-sided difference near boundaries
if (row<alpha)
diff = alpha - row;
else if (row>N-1-alpha) // row > Nx-3 Nx-2 Nx-1
diff = N-1-alpha-row;
int tmp = (alpha-diff)*rank+alpha;
int tmp2 = row + diff;
int i;
for (i = -alpha; i<alpha+1; ++i)
value += (d_U[tmp2+i]*d_stencils[tmp+i]) ;
// Store computed approximation
d_Ux[row] = value;
}
}
int main......
...
// Allocate space on device
float *d_U, *d_Ux, *d_stencils;
cudaMalloc ((void**) &d_U, Nx*sizeof(float)); // TODO: Error checking
cudaMalloc ((void**) &d_Ux, Nx*sizeof(float)); // TODO: Error checking
cudaMalloc ((void**) &d_stencils, rank*sizeof(float)); // TODO: Error checking
// Copy data to device
cudaMemcpy (d_U, U, Nx*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy (d_stencils, stencils, rank*sizeof(float), cudaMemcpyHostToDevice);
// Blocks and grid galore
dim3 dimThreadBlock (BLOCK_SIZE_X, BLOCK_SIZE_Y);
dim3 dimBlockGrid (Nx/BLOCK_SIZE_X,1);
DiffX_GPU <<< dimBlockGrid, dimThreadBlock >>> (d_U, d_Ux, Nx, alpha, d_stencils, rank) ;
cudaThreadSynchronize();
// Copy result to host
cudaMemcpy (Ux, d_Ux, Nx*sizeof(float), cudaMemcpyDeviceToHost);
...
```