I am working through an example from gpubootcamp, but the code runs extremely slowly: it takes longer to execute on the GPU than it does serially on the CPU. What am I doing wrong? The code I am running is below.
I am compiling with the following command: nvc -acc -gpu=managed -Minfo=accel
I am using an NVIDIA GeForce RTX 3050 Laptop GPU, and I am running the code inside a container.
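For completeness, the full invocation looks like this (jacobi.c is just my local name for the source file; substitute your own):

nvc -acc -gpu=managed -Minfo=accel -o jacobi jacobi.c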
#include <math.h>
#include <stdlib.h>
#include <string.h>

// Row-major indexing into a flattened (n x m) grid.
#define OFFSET(x, y, m) (((x) * (m)) + (y))

// Zero both grids, then set the top boundary row to 1.0.
void initialize(double *restrict A, double *restrict Anew, int m, int n)
{
    memset(A, 0, n * m * sizeof(double));
    memset(Anew, 0, n * m * sizeof(double));

    for (int i = 0; i < m; i++) {
        A[i] = 1.0;
        Anew[i] = 1.0;
    }
}

// One Jacobi sweep: each interior point becomes the average of its four
// neighbours; returns the largest change between A and Anew.
double calcNext(double *restrict A, double *restrict Anew, int m, int n)
{
    double error = 0.0;
    #pragma acc parallel loop reduction(max:error)
    for (int j = 1; j < n - 1; j++) {
        for (int i = 1; i < m - 1; i++) {
            Anew[OFFSET(j, i, m)] = 0.25 * ( A[OFFSET(j, i + 1, m)] + A[OFFSET(j, i - 1, m)]
                                           + A[OFFSET(j - 1, i, m)] + A[OFFSET(j + 1, i, m)] );
            error = fmax(error, fabs(Anew[OFFSET(j, i, m)] - A[OFFSET(j, i, m)]));
        }
    }
    return error;
}

// Copy the interior of Anew back into A for the next sweep.
void swap(double *restrict A, double *restrict Anew, int m, int n)
{
    #pragma acc parallel loop
    for (int j = 1; j < n - 1; j++) {
        for (int i = 1; i < m - 1; i++) {
            A[OFFSET(j, i, m)] = Anew[OFFSET(j, i, m)];
        }
    }
}

void deallocate(double *restrict A, double *restrict Anew)
{
    free(A);
    free(Anew);
}
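I left out my driver; it is essentially the standard gpubootcamp-style main loop, roughly like this (the grid size, iteration cap, and tolerance below are placeholders, not my exact values):

#include <stdio.h>   // for printf

int main(void)
{
    // Placeholder parameters; my real run may use different values.
    const int n = 4096;          // grid rows
    const int m = 4096;          // grid columns
    const int iter_max = 1000;   // maximum number of Jacobi sweeps
    const double tol = 1.0e-6;   // convergence tolerance

    double *A    = (double *) malloc(sizeof(double) * n * m);
    double *Anew = (double *) malloc(sizeof(double) * n * m);

    initialize(A, Anew, m, n);

    double error = 1.0;
    int iter = 0;
    while (error > tol && iter < iter_max) {
        error = calcNext(A, Anew, m, n);  // one sweep, returns max change
        swap(A, Anew, m, n);              // Anew becomes the new A
        iter++;
    }
    printf("iterations: %d, final error: %f\n", iter, error);

    deallocate(A, Anew);
    return 0;
}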