Hi there.
I’ve been testing out a small program in OpenACC and it seems I have a small problem with it.
I wrote this small program:
#define M_X 10
#define M_Y 10
#include <stdio.h>
#include <stdlib.h>
//#include <openacc.h>
float matrix[M_Y][M_X];
void printMatrix(int height, int lenght)
{
int i,j;
for(i=0; i<height; i++)
{
printf("%d> ", i);
for(j=0; j<lenght; j++)
{
printf("%f ", matrix[i][j]);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
float scaler[M_Y], result=0.0;
int i, j;
printf("Hello world\n");
// create matrix
for(i=0; i<M_Y; i++)
for(j=0; j<M_X; j++)
matrix[i][j]=(float)(i*j)/10.0;
for(i=0; i<M_Y; i++)
{
scaler[i]=0.0;
for(j=0; j<M_X; j++)
{
if(matrix[i][j]>scaler[i])
scaler[i]=matrix[i][j];
}
for(j=0; j<M_X; j++)
{
if(scaler[i] !=0)
matrix[i][j] /= scaler[i];
}
}
printf("Matrix:\n");
printMatrix(M_Y, M_X);
printf("\nscaler:\n");
for (i=0; i<M_Y; i++)
printf("%f ", scaler[i]);
printf("\n");
for(i=0; i<M_Y; i++)
{
scaler[i] *= 0.25;
result+=scaler[i];
}
printf("result -> %f\n", result);
exit(0);
}
Which gives me the following output:
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result → 10.125000
And then I tried to parallelize it using OpenACC, and I changed the code to the following:
#define M_X 10
#define M_Y 10
#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>
float matrix[M_Y][M_X];
void printMatrix(int height, int lenght)
{
int i,j;
for(i=0; i<height; i++)
{
printf("%d> ", i);
for(j=0; j<lenght; j++)
{
printf("%f ", matrix[i][j]);
}
printf("\n");
}
return;
}
int main(int argc, char* argv[])
{
float scaler[M_Y], aux_scaler, result=0.0;
int i, j;
printf("Hello world\n");
// create matrix
for(i=0; i<M_Y; i++)
for(j=0; j<M_X; j++)
matrix[i][j]=(float)(i*j)/10.0;
#pragma acc data copy(matrix), copyout(scaler[M_Y]), create(aux_scaler)
{
#pragma acc kernels loop private(aux_scaler)
for(i=0; i<M_Y; i++)
{
aux_scaler=0.0;
#pragma acc loop reduction(max : aux_scaler)
for(j=0; j<M_X; j++)
{
if(matrix[i][j]>aux_scaler)
aux_scaler=matrix[i][j];
}
scaler[i]=aux_scaler;
}
#pragma acc kernels loop independent
for(i=0; i<M_Y; i++)
{
#pragma acc loop independent
for(j=0; j<M_X; j++)
{
if(scaler[i] !=0)
matrix[i][j] /= scaler[i];
}
}
}
printf("Matrix:\n");
printMatrix(M_Y, M_X);
printf("\nscaler:\n");
for (i=0; i<M_Y; i++)
printf("%f ", scaler[i]);
printf("\n");
#pragma acc data copyin(scaler[M_Y]) copy(result)
{
#pragma acc kernels loop reduction(+:result)
for(i=0; i<M_Y; i++)
{
scaler[i] *= 0.25;
result+=scaler[i];
}
}
printf("result -> %f\n", result);
exit(0);
}
But now I get this output:
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result → 0.000000
It seems that everything is fine until the reduction clause I use at the end. Either the result is not being computed or the value is not being transfered back to the CPU.
What am I doing wrong with the reduction clause?
Thanks in advance!