OpenACC data copy with AMD FirePro W9100

HI,

I made an OpenACC code where most of variables are double type.

Before, I used NVIDIA Geforce GPU. But, in order to reduce the calculation time, I bought AMD FirePro W9100.

The code compiled without problem with compiler option -ta=radeon:hawaii.
(I’m using PGI Community Edition Version 16.10. After installing AMD driver, I reinstalled PGI Community Edition.)

However, when I run it, the data clause does not work even though pgaccelinfo says that I have AMD FIrePro W9100.
(In the below sample code, in the output file, x and y are arbitrary numbers. Even if I use -ta=radeon option, the problem is not solved.)

Compiling the same code with compiler option -ta=multicore or -ta= tesla:cuda8.0 (when I used Geforce GPU which is compatible with cuda8.0) does not bring any problem and data clause works perfectly fine.

Could you give me some help so that the below code works with AMD GPU?
If OpenACC is not compatible with AMD GPU, should I have to use OpenCL?
(If so, is there any simple way to convert OpenACC code to OpenCL code? I’m afraid that it would take too much time to run OpenCL)

Thanks in advance.

Here are my test code. It consists of main code file, header file. If one compiles with “pgcc -acc -ta=radeon:hawaii -Minfo=accel amdtest.c -o amdtest” and the input file is “testinput”, then one should type “./amdtest -p testinput” in order to execute the program.

#include "test.h"

int main(int argc, char **argv) {
	FILE *out;
	int optnum, optcheck;
	long cnti, cntj;

	double *restrict x;
	double *restrict y;

	if ((argc < 3) || (strcmp(*(argv + 1), "-p") != 0)) {
		fprintf(stderr, "Usage: %s -p <parameterfile> \n", *argv);
		exit(EXIT_FAILURE);
	}

	optnum = argc - 1;
	printf("Parameter numbers = %d\n", optnum - 1);

	for(optcheck = 2; optcheck <= optnum; optcheck++)
	{
		if (!cfg_init(argv[optcheck])) {
			fprintf(stderr, "Wrong input parameter file.\n");
			exit(EXIT_FAILURE);
		}

		readpar(argv[optcheck]);

		x = alloc_double_vector(Nx);
		y = alloc_double_vector(Ny);

		#pragma acc data copyout(x[:Nx], y[:Ny]) pcopyin(Nx, Ny, dx, dy, Nx2, Ny2)
		{
			#pragma acc kernels loop
			for (cnti = 0; cnti < Nx; cnti++) {
				x[cnti] = (cnti - Nx2) * dx;
			}

			#pragma acc kernels loop
			for (cntj = 0; cntj < Ny; cntj++) {
				y[cntj] = (cntj - Ny2) * dy;
			}
		}

		if (outname != NULL) {
			out = fopen(outname, "w");
			for (cnti = 0; cnti < Nx; cnti += outstpx) {
				for (cntj = 0; cntj < Ny; cntj += outstpy) {
					fprintf(out, "%e %e\n", x[cnti], y[cntj]);
				}
			}
			fclose(out);
		}

		free_double_vector(x);
		free_double_vector(y);
	}

	return(EXIT_SUCCESS);
}

int cfg_init(char *cfg_file) {
    FILE *file;
    char buf[256];

    file = fopen(cfg_file, "r");
    if (!file) return 0;

    cfg_size = 0;
    while (fgets(buf, 256, file) != NULL) {
        if (sscanf(buf, "%s = %s", cfg_key[cfg_size], cfg_val[cfg_size]) == 2) cfg_size++;
    }

    fclose(file);
    return cfg_size;
}

char *cfg_read(char *key) {
    int i;

    for (i = 0; i < cfg_size; i++)
        if (!strcmp(key, cfg_key[i])) return cfg_val[i];

    return NULL;
}

double *alloc_double_vector(long Nx1) {
    double *vector;

    if ((vector = (double *)malloc((size_t)(Nx1 * sizeof(double)))) == NULL) {
        fprintf(stderr, "Failed to allocate memory for the vector.\n");
        exit(EXIT_FAILURE);
    }

    return vector;
}

void free_double_vector(double *vector) {
    free((char *)vector);
}

void readpar(char *cfg_tmp) {

    if ((cfg_tmp = cfg_read("NX")) == NULL) {
        fprintf(stderr, "NX is not defined in the configuration file.\n");
        exit(EXIT_FAILURE);
    }
    Nx = atol(cfg_tmp);

    if(Nx % 2 == 0) {
	Nx = Nx + 1;
	printf("Nx is changed.\n");
    }

    Nx2 = Nx / 2;

    if ((cfg_tmp = cfg_read("NY")) == NULL) {
        fprintf(stderr, "NY is not defined in the configuration file.\n");
        exit(EXIT_FAILURE);
    }
    Ny = atol(cfg_tmp);

    if(Ny % 2 == 0) {
	Ny = Ny + 1;
	printf("Ny is changed.\n");
    } 

    Ny2 = Ny / 2;

    if ((cfg_tmp = cfg_read("DX")) == NULL) {
        fprintf(stderr, "DX is not defined in the configuration file.\n");
        exit(EXIT_FAILURE);
    }
    dx = atof(cfg_tmp);

    if ((cfg_tmp = cfg_read("DY")) == NULL) {
        fprintf(stderr, "DY is not defined in the configuration file.\n");
        exit(EXIT_FAILURE);
    }
    dy = atof(cfg_tmp);

    outname = cfg_read("OUTNAME");

    if ((outname != NULL)) {
        if ((cfg_tmp = cfg_read("OUTSTPX")) == NULL) {
            fprintf(stderr, "OUTSTPX is not defined in the configuration file.\n");
            exit(EXIT_FAILURE);
        }
        outstpx = atol(cfg_tmp);

        if ((cfg_tmp = cfg_read("OUTSTPY")) == NULL) {
            fprintf(stderr, "OUTSTPY is not defined in the configuration file.\n");
            exit(EXIT_FAILURE);
        }
        outstpy = atol(cfg_tmp);
    }

    return;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

long Nx, Ny;
long Nx2, Ny2;

double dx, dy;

char *outname;

long outstpx, outstpy;

void readpar(char *);

double *alloc_double_vector(long);
void free_double_vector(double *);

int cfg_init(char *);
char *cfg_read(char *);
int cfg_size;
char cfg_key[256][256], cfg_val[256][256];
# Number of discretization points in the x axis.
# Type: long
  NX = 101

# Number of discretization points in the y axis.
# Type: long
  NY = 101

# Spatial discretization step in the x axis.
# Type: double
  DX = 0.01

# Spatial discretization step in the y axis.
# Type: double
  DY = 0.01

# Output file
# Type: string
  OUTNAME = testoutput.txt

# Discretization step in the x-direction used to save x.
# Type: long
  OUTSTPX = 1

# Discretization step in the y-direction used to save y.
# Type: long
  OUTSTPY = 1

Hi sshin39285,

When you say that “data clause does not work”, what do you mean? That the code ran but you don’t get correct answers? Are you getting a runtime error?

I tried your code on my Radeon device and got the same output as compiling without OpenACC. Granted, my device is a 7660 not a W9100, but that shouldn’t matter.

If OpenACC is not compatible with AMD GPU, should I have to use OpenCL?

While PGI did support some Radeon devices in PGI 16.10, for a variety of business and technical reasons, support for Radeon GPUs was removed from the PGI compilers on both Linux and Windows platforms starting with the PGI 17.1 release. So it’s not that OpenACC, which is an open standard implemented by multiple compilers, doesn’t support AMD, it’s that PGI has removed support for AMD as of the 17.1.

If so, is there any simple way to convert OpenACC code to OpenCL code? I’m afraid that it would take too much time to run OpenCL

I did a web search for the term “openacc to opencl translator” and found this article from 2014 http://www.techenablement.com/ipmacc-open-source-openacc-cudaopencl-translator/ which talks about an open source OpenACC to CUDA/OpenCL translator. I’ve never used it, so don’t know it’s quality or even if it’s still maintained. There’s also a long list of limitations so may not be suitable for your code. Though, you’re welcome to give it a try.

-Mat

HI mkcolg,

Thank you for your reply, but I would like to ask some more:

When you say that “data clause does not work”, what do you mean? That the code ran but you don’t get correct answers? Are you getting a runtime error?

I tried your code on my Radeon device and got the same output as compiling without OpenACC. Granted, my device is a 7660 not a W9100, but that shouldn’t matter.

When I ran the code with W9100 for NX = NY = 3 in the input file(“testinput”), the output file “testoutput.txt” gives following results:

2.148644e-314 -3.985030e+304
2.148644e-314 -3.985030e+304
2.148644e-314 -6.327476e+304
-3.985030e+304 -3.985030e+304
-3.985030e+304 -3.985030e+304
-3.985030e+304 -6.327476e+304
-6.327476e+304 -3.985030e+304
-6.327476e+304 -3.985030e+304
-6.327476e+304 -6.327476e+304

SInce DX = DY = 0.01 in the “testinput”, dx = dy = 0.01. Also, as I changed NX = NY = 3 in the “testinput” file, Nx = Ny = 3 and Nx2 = Ny2 = 1. So, correct results should be

-1.000000e-02 -1.000000e-02
-1.000000e-02 0.000000e+00
-1.000000e-02 1.000000e-02
0.000000e+00 -1.000000e-02
0.000000e+00 0.000000e+00
0.000000e+00 1.000000e-02
1.000000e-02 -1.000000e-02
1.000000e-02 0.000000e+00
1.000000e-02 1.000000e-02

which I got for -ta=multicore option. In this sense, I thought that data clause does not work for W9100. However, since you checked that my test code gives correct result with Radeon 7660, I don’t know why W9100 gives strange results.
(https://forums.developer.nvidia.com/t/ta-radeon-hawaii-not-supported-anymore/134962/1 says that OpenACC code runs successfully with W9100. But, for me, it gives strange results as I said above, even if I use -ta=radeon option.)

While PGI did support some Radeon devices in PGI 16.10, for a variety of business and technical reasons, support for Radeon GPUs was removed from the PGI compilers on both Linux and Windows platforms starting with the PGI 17.1 release. So it’s not that OpenACC, which is an open standard implemented by multiple compilers, doesn’t support AMD, it’s that PGI has removed support for AMD as of the 17.1.

I saw that there is a way to compile OpenACC code with GCC compiler(http://scelementary.com/2015/04/25/openacc-in-gcc.html). But it is not for AMD…

Would it be OK if I ask which compiler might support OpenACC code with W9100? If PGI compiler supports OpenACC code with W9100, please let me know.

I did a web search for the term “openacc to opencl translator” and found this article from 2014 > http://www.techenablement.com/ipmacc-open-source-openacc-cudaopencl-translator/ > which talks about an open source OpenACC to CUDA/OpenCL translator.

Thank you for your suggestion. I will try that and see if it works with W9100.

P.S. To provide further information, I post here the result of pgaccelinfo:

OpenCL Platform:               AMD Accelerated Parallel Processing
OpenCL Vendor:                 Advanced Micro Devices, Inc.

Device Number:                 0
Device Name:                   Hawaii
Available:                     Yes
Compiler Available:            Yes
Board Name:                    AMD Radeon FirePro W9100
Device Version:                OpenCL 1.2 AMD-APP (2264.10)
Global Memory Size:            16978542592
Maximum Object Size:           4244635648
Global Cache Size:             16384
Free Memory:                   33089688000
Max Clock (MHz):               930
Compute Units:                 44
SIMD Units:                    4
SIMD Width:                    16
GPU Cores:                     2816
Wavefront Width:               64
Constant Memory Size:          4244635648
Local Memory Size:             32768
Workgroup Size:                256
Address Bits:                  64
ECC Support:                   No
PGI Compiler Option:           -ta=radeon:hawaii

Hi sshin39285,

The answers I get on all targets including Radeon are the same as your multicore results. Unfortunately without a way to recreate the issue, I’m unable to determine the cause. PGI 16.10 does support Hawaii based Radeon devices but it’s unclear why you’re getting wrong answers on your device.

One thing we can try is for you to set the environment variable “PGI_ACC_DEBUG=1” and log the output. This will show all the OpenACC runtime calls made by the program and might give us some clues as to the issue. However, the output can be quite large so too big to post. Please send the log to PGI Customer Service (trs@pgroup.com) and ask them to forward it to me.

-Mat

Hi Mat,

I had some problem on my computer. After reinstalling Ubuntu 16.04, AMD driver, PGI Community Edition 16.10, I just send the log file to PGI Customer Service(trs@pgroup.com) and asked them to forward it to you.
(Unfortunately, still my OpenACC test code gives ridiculous results for FirePro W9100 with -ta=radeon or -ta=radeon:hawaii. But it gives correct results for intel i7 with -ta=multicore option.)

NX = NY = 3 and DX = DY = 0.01 in the “testinput” file as I said previously.

Thank you for your help.

Shin

P.S. I used following commands to get log file:

  1. export PGI_ACC_DEBUG=1
  2. pgcc -acc -ta=radeon:hawaii -Minfo=accel amdtest.c -o amdtest
  3. ./amdtest -p testinput
  4. Copy and paste words to text file.

Sorry to bother again, but I have another problem…

I formatted hard disk, installed Ubuntu 14.04.4 and installed AMD catalyst driver(fglrx-15.302.2301. http://support.amd.com/en-us/download/workstation?os=Linux+x86_64#pro-driver).

After that, I installed PGI Community Edition 16.10, typed “sudo vi ~/.profile” to the terminal, and added 4 lines:

export PGI=/opt/pgi;
export PATH=/opt/pgi/linux86-64/16.10/bin:$PATH;
export MANPATH=$MANPATH:/opt/pgi/linux86-64/16.10/man;
export LM_LICENSE_FILE=$LM_LICENSE_FILE:/opt/pgi/license.dat;

After reboot, when I typed “pgaccelinfo”, it said that libOpenCL.so cannot be found. So, I installed AMD APP SDK v3.0(http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/), typed “sudo vi ~/.profile”, and added 1 more line:

export LD_LIBRARY_PATH=/opt/AMDAPPSDK-3.0/lib/x86_64;

After reboot, when I typed “pgaccelinfo”, I got following result:

OpenCL Platform: AMD Accelerated Parallel Processing
OpenCL Vendor: Advanced Micro Devices, Inc.

Device Number: 0
Device Name: Hawaii
Available: Yes
Compiler Available: Yes
Board Name: AMD FirePro W9100
Device Version: OpenCL 2.0 AMD-APP (1912.5)
Global Memory Size: 34039939392
Maximum Object Size: 25390694400
Global Cache Size: 16384
Free Memory: 33218384000
Max Clock (MHz): 930
Compute Units: 44
SIMD Units: 4
SIMD Width: 16
GPU Cores: 2816
Wavefront Width: 64
Constant Memory Size: 65536
Local Memory Size: 32768
Workgroup Size: 256
Address Bits: 64
ECC Support: No
PGI Compiler Option: -ta=radeon:hawaii

Now, when I compiled “pgcc -acc -ta=radeon:hawaii -Minfo=accel amdtest.c -o amdtest” and executed the program with “./amdtest -p testinput”, it says “Segmentation fault (core dumped)”.

Using PGI_ACC_DEBUG=1, I got following short lines:

ACC: detected 1 OpenCL platforms
ACC: OpenCL platform[0] has vendor Advanced Micro Devices, Inc.
ACC: detected 1 AMD OpenCL device(s)
ACC: device[1] is AMD OpenCL device 0: Hawaii
ACC: initialized 1 AMD OpenCL device(s)
ACC: device[2] is PGI native
ACC: device[0] is PGI native
pinitialize for thread 1
curr_devid for thread 1 is 1
pgi_uacc_dataenterstart( file=/media/mount/openacc/simpletest/amdtest.c, function=main, line=3:59, line=31, devid=0 )
pgi_uacc_dataon(hostptr=0x22bc150,stride=1,size=3,extent=-1,eltsize=8,lineno=31,name=x,flags=0xb00=present+create+copyout,async=-1,threadid=1)
pgi_uacc_alloc(size=24,devid=1,threadid=1)
Segmentation fault (core dumped)

Did I made mistakes when installing PGI compiler? (AMD catalyst driver seems to work well in the sense that there is no problem about display, no infinite login screen problem, and “fglrxinfo” gives correct GPU product name. So I don’t think I made mistakes when installing that driver…)

To be sure, I compiled the same code with option “-ta=multicore”. Then the program runs without problem and results are correct.(in the “testinput” file, NX = NY = 3 for “-ta=radeon:hawaii” and “-ta=multicore” cases)

Thank you for your help.

Regards,

Shin