Hi Mat,
of course, I provide below the two small codes iterativelly solving the Laplace equation. The first one uses static memory (changing directives I noted unexpected behaviors as described above) whereas the second one uses dynamic memory and seems to fail. I hope there is no bug but you can double-check it…
The file tree is:
select_device.h
LAPLACE/laplace_static.c
LAPLACE/laplace_dynamic.c
The files are:
select_device.h
#ifdef _OPENACC
int mygpu, myrealgpu, num_devices;
acc_device_t my_device_type;
#ifdef CAPS
my_device_type = acc_device_cuda;
#elif PGI
my_device_type = acc_device_nvidia;
#endif
if(argc == 1) mygpu = 0; else mygpu = atoi(argv[1]);
acc_set_device_type(my_device_type) ;
num_devices = acc_get_num_devices(my_device_type) ;
fprintf(stderr,"Number of devices available: %d \n ",num_devices);
acc_set_device_num(mygpu,my_device_type);
fprintf(stderr,"Trying to use GPU: %d \n",mygpu);
myrealgpu = acc_get_device_num(my_device_type);
fprintf(stderr,"Actually I am using GPU: %d \n",myrealgpu);
if(mygpu != myrealgpu) {
fprintf(stderr,"I cannot use the requested GPU: %d\n",mygpu);
exit(1);
}
#endif
laplace_static.c
#include <math.h>
#include <string.h>
#include <float.h>
#ifdef _OPENACC
#include <openacc.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NN 4096
#define NM 4096
double A[NN][NM];
double Anew[NN][NM];
int main(int argc, char *argv[]) {
int n = NN;
int m = NM;
int iter_max = 100;
double tol = 1.0e-6;
double error = 1.0;
int iter = 0;
int i,j;
struct timeval temp_1, temp_2;
double ttime=0.;
#include "../select_device.h"
memset(A, 0, n * m * sizeof(double));
memset(Anew, 0, n * m * sizeof(double));
for (j = 0; j < n; j++)
{
A[j][0] = 1.0;
Anew[j][0] = 1.0;
}
printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m);
gettimeofday(&temp_1, (struct timezone*)0);
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max: error)
for( j = 1; j < n-1; j++) {
for( i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( j = 1; j < n-1; j++) {
for( i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
iter++;
if(iter % 10 == 0) printf("%5d, %0.8lf\n", iter, error);
}
gettimeofday(&temp_2, (struct timezone*)0);
ttime = 0.000001*((temp_2.tv_sec-temp_1.tv_sec)*1.e6+(temp_2.tv_usec-temp_1 .tv_usec));
printf("Elapsed time (s) = %.2lf\n", ttime);
printf("Stopped at iteration: %u\n", iter);
return 0;
}
laplace_dynamic.c:
#define MAX(A,B) (((A) > (B)) ? (A) : (B))
#define ABS(A) (((A) < (0)) ? (-A): (A))
// solves 2-D Laplace equation using a relaxation scheme
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef _OPENACC
#include<openacc.h>
#endif
int main(int argc, char *argv[]) {
double * T, * Tnew;
double tol, var = DBL_MAX, top = 100.0;
unsigned n, n2, maxIter, i, j, iter = 0;
int itemsread;
FILE *fout;
#include "../select_device.h"
#ifdef FIXED_INPUT
n = 4096 ; maxIter = 100 ; tol = 0.0001;
#else
printf("Enter mesh size, max iterations and tolerance: ");
itemsread = scanf("%u ,%u ,%lf", &n, &maxIter, &tol);
if (itemsread!=3) {
fprintf(stderr, "Input error!\n");
exit(-1);
}
#endif
n2 = n+2;
T = (double*)calloc(n2*n2, sizeof(*T));
Tnew = (double*)calloc(n2*n2, sizeof(*T));
if (T == NULL || Tnew == NULL) {
fprintf(stderr, "Not enough memory!\n");
exit(EXIT_FAILURE);
}
// set boundary conditions
for (i=1; i<=n; i++) {
T[(n+1)*n2+i] = i * top / (n+1);
T[i*n2+n+1] = i * top / (n+1);
}
for(i = 0; i<=n+1; i++)
for(j = 0; j<=n+1; j++)
Tnew[i*n2+j] = T[i*n2+j] ;
time_t startTime = clock();
#pragma acc data copy(T[0:n2*n2]), create(Tnew[0:n2*n2])
{
while(var > tol && iter <= maxIter) {
++iter;
var = 0.0;
#pragma acc parallel loop collapse(2) reduction(max:var)
for (i=1; i<=n; ++i) {
// uncomment for PGI and disable collapse above
//#pragma acc loop seq
for (j=1; j<=n; ++j) {
Tnew[i*n2+j] = 0.25*( T[(i-1)*n2+j] + T[(i+1)*n2+j]
+ T[i*n2+(j-1)] + T[i*n2+(j+1)] );
#ifdef _OPENACC
var = fmax(var, fabs(Tnew[i*n2+j] - T[i*n2+j]));
#else
var = MAX(var, ABS(Tnew[i*n2+j] - T[i*n2+j]));
#endif
}
}
#pragma acc parallel loop collapse(2)
for (i=1; i<=n; ++i) {
for (j=1; j<=n; ++j) {
T[i*n2+j] = Tnew[i*n2+j] ;
}
}
if (iter%10 == 0)
printf("iter: %8u, variation = %12.4lE\n", iter, var);
}
}
double endTime = (clock() - startTime) / (double) CLOCKS_PER_SEC;
printf("Elapsed time (s) = %.2lf\n", endTime);
printf("Mesh size: %u\n", n);
printf("Stopped at iteration: %u\n", iter);
printf("Maximum error: %lE\n", var);
#ifdef PRINT_OUTPUT
// saving results to file
fout = fopen("results", "w");
if (fout == NULL) {
perror("results");
exit(-1);
}
for (i=1; i<=n; ++i)
for (j=1; j<=n; ++j)
fprintf(fout, "%8u %8u %18.9lE\n", i, j, T[i*n2+j]);
fclose(fout);
#endif
free(T);
free(Tnew);
return 0;
}
To compile I use:
pgcc -acc -ta=nvidia,time,cuda5.0,cc35 -Minfo=accel -O3 -DPGI -DFIXED_INPUT <file_name.c>
I hope the cut and paste was correct. Check if it compiles.
many thanks
Francesco