Reduction clause

Hi there.

I’ve been testing out a small program in OpenACC and it seems I have a small problem with it.

I wrote this small program:

#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
//#include <openacc.h>

float matrix[M_Y][M_X];

/*
 * Print the top-left height x lenght part of the global matrix.
 * Every row is written on its own line, prefixed with "<row index>> ".
 */
void printMatrix(int height, int lenght)
{
    int row, col;

    for (row = 0; row < height; row++) {
        printf("%d> ", row);
        for (col = 0; col < lenght; col++)
            printf("%f ", matrix[row][col]);
        printf("\n");
    }
}

/*
 * Serial reference version.
 *
 * Fills the global matrix, divides every row by its largest element
 * (remembering that element in scaler[]), prints the matrix and the
 * scalers, then prints the sum of the scalers after multiplying each
 * one by 0.25.
 */
int main(int argc, char* argv[])
{
    float scaler[M_Y];
    float result = 0.0;
    int row, col;

    printf("Hello world\n");

    /* Fill the matrix: element (row, col) is row*col/10. */
    for (row = 0; row < M_Y; row++) {
        for (col = 0; col < M_X; col++) {
            matrix[row][col] = (float)(row * col) / 10.0;
        }
    }

    for (row = 0; row < M_Y; row++) {
        /* Largest element of this row, never less than 0. */
        float row_max = 0.0;

        for (col = 0; col < M_X; col++) {
            if (matrix[row][col] > row_max) {
                row_max = matrix[row][col];
            }
        }
        scaler[row] = row_max;

        /* A row whose maximum is 0 is left as it is (no division by zero). */
        if (row_max != 0) {
            for (col = 0; col < M_X; col++) {
                matrix[row][col] /= row_max;
            }
        }
    }

    printf("Matrix:\n");
    printMatrix(M_Y, M_X);

    printf("\nscaler:\n");
    for (row = 0; row < M_Y; row++) {
        printf("%f ", scaler[row]);
    }
    printf("\n");

    /* Shrink every scaler to a quarter and accumulate the total. */
    for (row = 0; row < M_Y; row++) {
        scaler[row] *= 0.25;
        result += scaler[row];
    }

    printf("result -> %f\n", result);

    exit(0);
}

Which gives me the following output:

Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 10.125000

And then I tried to parallelize it using OpenACC, and I changed the code to the following:

#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

float matrix[M_Y][M_X];

/*
 * Print the top-left height x lenght part of the global matrix.
 * Every row is written on its own line, prefixed with "<row index>> ".
 */
void printMatrix(int height, int lenght)
{
    int row, col;

    for (row = 0; row < height; row++) {
        printf("%d> ", row);
        for (col = 0; col < lenght; col++)
            printf("%f ", matrix[row][col]);
        printf("\n");
    }
}

/*
 * OpenACC version of the row-scaling program, exactly as posted in the
 * question.
 *
 * Steps: fill the global matrix, compute each row's maximum into scaler[]
 * and divide the row by it inside one data region, print the matrix and the
 * scalers on the host, then multiply every scaler by 0.25 and sum them into
 * result inside a second data region.
 *
 * NOTE(review): this is the version that prints "result -> 0.000000". The
 * reduction variables (aux_scaler, result) are also named in data/private
 * clauses; see the notes at those clauses.
 */
int main(int argc, char* argv[])
{
    float scaler[M_Y], aux_scaler, result=0.0;
    int i, j;
    printf("Hello world\n");
    
    // create matrix: element (i, j) is i*j/10
    for(i=0; i<M_Y; i++)
        for(j=0; j<M_X; j++)
            matrix[i][j]=(float)(i*j)/10.0;
    
// Data region shared by the two kernels below.
// NOTE(review): aux_scaler is listed in create() here although it is a
// reduction variable further down.
#pragma acc data copy(matrix), copyout(scaler[M_Y]), create(aux_scaler)
{
    // Pass 1: scaler[i] = largest element of row i (never less than 0).
    // NOTE(review): aux_scaler is also marked private on this loop.
    #pragma acc kernels loop private(aux_scaler)
    for(i=0; i<M_Y; i++)
    {
        aux_scaler=0.0;
        #pragma acc loop reduction(max : aux_scaler)
        for(j=0; j<M_X; j++)
        {
            if(matrix[i][j]>aux_scaler)
                aux_scaler=matrix[i][j];
        }
        scaler[i]=aux_scaler;
    }
    
    // Pass 2: divide each row by its maximum; rows whose maximum is 0 are
    // left unchanged.
    #pragma acc kernels loop independent
    for(i=0; i<M_Y; i++)
    {
        #pragma acc loop independent
        for(j=0; j<M_X; j++)
        {
            if(scaler[i] !=0)
                matrix[i][j] /= scaler[i];
        }
    }
}
    
    
    printf("Matrix:\n");
    printMatrix(M_Y, M_X);
    printf("\nscaler:\n");
    for (i=0; i<M_Y; i++)
        printf("%f ", scaler[i]);
    printf("\n");
    
    // NOTE(review): result is named both in copy() on this data region and in
    // the reduction clause on the loop inside it. With the PGI compiler used
    // in this thread the copy-back overwrites the reduced value, so the host
    // prints 0 for result.
    #pragma acc data copyin(scaler[M_Y]) copy(result)
    {
    #pragma acc kernels loop reduction(+:result)
    for(i=0; i<M_Y; i++)
    {
        scaler[i] *= 0.25;
        result+=scaler[i];
    }
    }
    
    printf("result -> %f\n", result);
    
    
    
    exit(0);
}

But now I get this output:

Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 0.000000

It seems that everything is fine until the reduction clause I use at the end. Either the result is not being computed or the value is not being transferred back to the CPU.

What am I doing wrong with the reduction clause?

Thanks in advance!

Hi JPMN,

Remove the data clauses that contain your reduction variables. We’ve seen users do this, especially those that started with the Cray compiler, so we are looking at adding support for this style of syntax. Though for now, what’s happening is that reduction variables are treated differently and putting them in copy clauses interferes with how the compiler is generating the reduction. Essentially, you are overwriting the “result” when copying it back to the host.

Here’s the modified code:

 % cat red.c
#define M_X 10
#define M_Y 10

#include <stdio.h>
#include <stdlib.h>
#include <openacc.h>

float matrix[M_Y][M_X];

void printMatrix(int height, int lenght)
{
    int i,j;
    // Print rows 0..height-1 and columns 0..lenght-1 of the global matrix.
    for(i=0; i<height; i++)
    {
        printf("%d> ", i); // row label, e.g. "3> "
        for(j=0; j<lenght; j++)
        {
            printf("%f ", matrix[i][j]);
        }
        printf("\n"); // end of this row
    }

    return;
}

int main(int argc, char* argv[])
{
    float scaler[M_Y], aux_scaler, result=0.0; // aux_scaler and result are the reduction variables
    int i, j;
    printf("Hello world\n");

    // create matrix: element (i, j) is i*j/10
    for(i=0; i<M_Y; i++)
        for(j=0; j<M_X; j++)
            matrix[i][j]=(float)(i*j)/10.0;
    // Data region names only the arrays; aux_scaler is not in any data clause.
#pragma acc data copy(matrix), copyout(scaler[M_Y])
{
    #pragma acc kernels loop
    for(i=0; i<M_Y; i++)
    {
        aux_scaler=0.0; // reset the running maximum for row i
        #pragma acc loop reduction(max : aux_scaler)
        for(j=0; j<M_X; j++)
        {
            if(matrix[i][j]>aux_scaler)
                aux_scaler=matrix[i][j];
        }
        scaler[i]=aux_scaler; // keep the row maximum for the scaling pass
    }
    // Scale each row by its maximum; rows with a zero maximum are left unchanged.
    #pragma acc kernels loop independent
    for(i=0; i<M_Y; i++)
    {
        #pragma acc loop independent
        for(j=0; j<M_X; j++)
        {
            if(scaler[i] !=0)
                matrix[i][j] /= scaler[i];
        }
    }
}


    printf("Matrix:\n");
    printMatrix(M_Y, M_X);
    printf("\nscaler:\n");
    for (i=0; i<M_Y; i++)
        printf("%f ", scaler[i]);
    printf("\n");
    // result is named only in the reduction clause below, not in the data clause.
    #pragma acc data copyin(scaler[M_Y])
    {
    #pragma acc kernels loop reduction(+:result)
    for(i=0; i<M_Y; i++)
    {
        scaler[i] *= 0.25; // quarter each scaler before summing
        result+=scaler[i];
    }
    }

    printf("result -> %f\n", result);



    exit(0);
}

% pgcc -acc -Minfo=accel red.c -V13.4 ; a.out
main:
     38, Generating copyout(scaler[0:])
         Generating copy(matrix[0:][0:])
     40, Generating present_or_copyout(scaler[0:])
         Generating present_or_copy(matrix[0:][0:])
         Generating NVIDIA code
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     41, Loop is parallelizable
         Accelerator kernel generated
         41, #pragma acc loop gang /* blockIdx.x */
         45, #pragma acc loop vector(32) /* threadIdx.x */
         Loop is parallelizable
     53, Generating present_or_copy(matrix[0:][0:])
         Generating present_or_copyout(scaler[0:])
         Generating NVIDIA code
         Generating compute capability 1.0 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     54, Loop is parallelizable
     57, Loop is parallelizable
         Accelerator kernel generated
         54, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         57, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     73, Generating copyin(scaler[0:])
     75, Generating present_or_copyin(scaler[0:])
         Generating NVIDIA code
         Generating compute capability 1.3 binary
         Generating compute capability 2.0 binary
         Generating compute capability 3.0 binary
     76, Loop is parallelizable
         Accelerator kernel generated
         76, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Hello world
Matrix:
0> 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
2> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
3> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
4> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
5> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
6> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
7> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
8> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000
9> 0.000000 0.111111 0.222222 0.333333 0.444444 0.555556 0.666667 0.777778 0.888889 1.000000

scaler:
0.000000 0.900000 1.800000 2.700000 3.600000 4.500000 5.400000 6.300000 7.200000 8.100000
result -> 10.125000

Hope this helps,
Mat

Hi Mat.

That was exactly it! Thank you so much, you solved my problem!

This was the second time I had to work with the reduction clause; the first time I got this error, I just made a workaround to solve it.

Thank you once again!

JPMN.