Hi, I am using PGI 10.9 but am unable to generate an accelerator kernel for the following code.
// Accelerator region; grid is named in the copyout clause.
// NOTE(review): the bounds 0:pixelsX*pixelsY-1 presumably denote an inclusive
// index range covering the whole buffer -- confirm against the compiler docs.
#pragma acc region copyout(grid[0:pixelsX*pixelsY-1])
{
// Visit every grid point, one row (pixelY) at a time.
for(pixelY=0; pixelY<pixelsY; pixelY++) {
for(pixelX=0; pixelX<pixelsX; pixelX++) {
// Map the pixel indices to the starting coordinates (x, y).
y = maxY - pixelY*spacing;
x = minX + pixelX*spacing;
iterations = 0;
// Remember the starting point; it is added back in on every iteration.
x0 = x;
y0 = y;
// Iterate while x*x + y*y is at most 4 and the iteration cap is not reached.
// This compound loop condition is what the compiler diagnostics refer to.
while(x*x + y*y <= 4 && iterations < maxIterations) {
// xtmp holds the new x so that the y update still sees the old x.
xtmp = x*x - y*y + x0;
y = 2*x*y + y0;
x = xtmp;
iterations++;
}
//Write number of iterations required to array
// (row-major index: one row of pixelsX entries per pixelY).
grid[pixelY*pixelsX + pixelX] = iterations;
}
}
}
When I try to compile, I get the following:
39, Accelerator region ignored
50, Accelerator restriction: induction variable live-out from loop: iterations
Accelerator restriction: loop has multiple exits
Accelerator restriction: loop contains induction variable live after the loop
55, Accelerator restriction: induction variable live-out from loop: iterations
I want everything inside the two outermost for loops to be the kernel, but the compiler doesn’t seem to like that. I have tried using "#pragma acc for kernel", but that hasn’t helped either. Can anyone shed some light on how to get this kernel generated? Thanks!
The entire code is located below:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Computes, for every point of a regular grid over [minX, maxX] x [minY, maxY],
 * the number of iterations of x' = x*x - y*y + x0, y' = 2*x*y + y0 (the
 * Mandelbrot escape-time iteration) performed before x*x + y*y exceeds 4,
 * capped at maxIterations, and stores the counts in grid.
 *
 * argc and argv are unused. Always returns 0.
 *
 * NOTE(review): the malloc result is not checked, and grid is neither read
 * nor freed after the accelerator region.
 */
int main(int argc, char *argv[])
{
int pixelX, pixelY;
//Dimensions of grid
const double minX = -1.8;
const double maxX = 0.7;
const double minY = -1.2;
const double maxY = 1.2;
//spacing between grid points
const double spacing = 0.005;
//Number of grid points in x and y direction
const int pixelsX = (int)ceil(fabs(maxX-minX)/spacing);
const int pixelsY = (int)ceil(fabs(maxY-minY)/spacing);
printf("%d x %d grid size\n", pixelsX, pixelsY);
// Size of the grid buffer in bytes: one int per grid point.
size_t bytes = pixelsX*pixelsY*sizeof(int);
//Allocate grid to hold number of iterations for escape
int *grid = (int*)malloc(bytes);
//Maximum iterations to test for escape
const int maxIterations = 255;
// Per-point working variables; all are reassigned for every grid point.
int iterations = 0;
double x = 0;
double y = 0;
double xtmp = 0;
double x0;
double y0;
// Accelerator region; grid is named in the copyout clause.
// NOTE(review): the bounds 0:pixelsX*pixelsY-1 presumably denote an inclusive
// index range covering the whole buffer -- confirm against the compiler docs.
#pragma acc region copyout(grid[0:pixelsX*pixelsY-1])
{
// Visit every grid point, one row (pixelY) at a time.
for(pixelY=0; pixelY<pixelsY; pixelY++) {
for(pixelX=0; pixelX<pixelsX; pixelX++) {
// Map the pixel indices to the starting coordinates (x, y).
y = maxY - pixelY*spacing;
x = minX + pixelX*spacing;
iterations = 0;
// Remember the starting point; it is added back in on every iteration.
x0 = x;
y0 = y;
// Iterate while x*x + y*y is at most 4 and the iteration cap is not reached.
while(x*x + y*y <= 4 && iterations < maxIterations) {
// xtmp holds the new x so that the y update still sees the old x.
xtmp = x*x - y*y + x0;
y = 2*x*y + y0;
x = xtmp;
iterations++;
}
//Write number of iterations required to array
// (row-major index: one row of pixelsX entries per pixelY).
grid[pixelY*pixelsX + pixelX] = iterations;
}
}
}
return 0;
}
Hi AdamSimpson,
It’s the real (floating-point) comparison in the while loop condition that’s causing the problem. I’ve modified the code below to get it to accelerate. Note that the ‘independent’ clauses are necessary because of grid’s computed index.
Hope this helps,
Mat
// Accelerator region; grid is named in the copyout clause.
#pragma acc region copyout(grid[0:pixelsX*pixelsY-1])
{
    // 'independent' is needed on both loops because grid is written through a
    // computed index (pixelY*pixelsX + pixelX).
    #pragma acc for independent
    for(pixelY=0; pixelY<pixelsY; pixelY++) {
        // iterations is declared private so it is not treated as live-out of the loop.
        #pragma acc for independent, private(iterations)
        for(pixelX=0; pixelX<pixelsX; pixelX++) {
            // Map the pixel indices to the starting coordinates (x, y).
            y = maxY - pixelY*spacing;
            x = minX + pixelX*spacing;
            iterations = 0;
            // Remember the starting point; it is added back in on every iteration.
            x0 = x;
            y0 = y;
            // Same loop as
            //   while(x*x + y*y <= 4 && iterations < maxIterations)
            // but with the floating-point test moved out of the loop condition
            // and into the body, leaving only the integer bound in the condition.
            while(iterations < maxIterations) {
                // The original loop CONTINUES while x*x + y*y <= 4, so the
                // early exit must fire on the opposite condition (> 4).
                if (x*x + y*y > 4) {
                    break;
                } else {
                    // xtmp holds the new x so that the y update still sees the old x.
                    xtmp = x*x - y*y + x0;
                    y = 2*x*y + y0;
                    x = xtmp;
                    iterations++;
                }
            }
            //Write number of iterations required to array
            grid[pixelY*pixelsX + pixelX] = iterations;
        }
    }
}
Thanks Mat,
That does compile but does not produce the correct result. The example below is the smallest case I could make: when compiled with -ta=host I get 9 for all of the values in grid, as expected, but when compiled with -ta=nvidia I get 0 for all of the values. Any ideas? Is it a problem with version 10.9? Thanks
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/*
 * Minimal reproducer: fills a 10 x 10 grid inside an accelerator region with
 * a per-point iteration count and then prints every entry, one per line.
 *
 * argc and argv are unused. Always returns 0.
 *
 * NOTE(review): the malloc result is not checked and grid is never freed.
 */
int main(int argc, char *argv[])
{
//Number of grid points in x and y direction
const int pixelsX = 10;
const int pixelsY = 10;
// Size of the grid buffer in bytes: one int per grid point.
size_t bytes = pixelsX*pixelsY*sizeof(int);
//Allocate grid to hold number of iterations for escape
int *grid = (int*)malloc(bytes);
int iterations;
int pixelY, pixelX;
// Accelerator region; grid is named in the copyout clause.
#pragma acc region copyout(grid[0:pixelsX*pixelsY-1])
{
#pragma acc for independent
for(pixelY=0; pixelY<pixelsY; pixelY++) {
#pragma acc for independent, private(iterations)
for(pixelX=0; pixelX<pixelsX; pixelX++) {
iterations = 0;
// For pixelX > 8 the loop exits immediately, leaving iterations at 0;
// otherwise iterations is incremented until it reaches 100.
while(iterations < 100)
{
if(pixelX > 8) {
break;
}
else {
iterations++;
}
}
//Write number of iterations required to array
// (row-major index: one row of pixelsX entries per pixelY).
grid[pixelY*pixelsX + pixelX] = iterations;
}
}
}
// Print the grid in row-major order, one value per line.
for(pixelY=0; pixelY<pixelsY; pixelY++){
for(pixelX=0; pixelX<pixelsX; pixelX++) {
printf("%d\n",grid[pixelY*pixelsX + pixelX]);
}
}
return 0;
}
Is it a problem with version 10.9?
Looks like it. The good news is that it was resolved in PGI 2011. It is most likely related to a problem that I found internally in a similar Fortran code.
AdamSimpson% pgcc -ta=nvidia,keepgpu -fast testACC2.c -Minfo -V11.2 ; a.out
main:
19, Generating copyout(grid[:pixelsY*pixelsX-1])
Generating compute capability 1.0 binary
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
22, Loop is parallelizable
24, Loop is parallelizable
Accelerator kernel generated
22, #pragma acc for parallel, vector(10) /* blockIdx.y threadIdx.y */
24, #pragma acc for parallel, vector(10) /* blockIdx.x threadIdx.x */
CC 1.0 : 7 registers; 48 shared, 12 constant, 0 local memory bytes; 100% occupancy
CC 1.3 : 7 registers; 48 shared, 12 constant, 0 local memory bytes; 100% occupancy
CC 2.0 : 11 registers; 8 shared, 56 constant, 0 local memory bytes; 66% occupancy
43, Loop not vectorized/parallelized: contains call
0 0 100
0 1 100
0 2 100
0 3 100
0 4 100
0 5 100
0 6 100
0 7 100
0 8 100
0 9 0
1 0 100
1 1 100
1 2 100
1 3 100
1 4 100
1 5 100
1 6 100
1 7 100
1 8 100
1 9 0
2 0 100
2 1 100
2 2 100
2 3 100
2 4 100
2 5 100
2 6 100
2 7 100
2 8 100
2 9 0
3 0 100
3 1 100
3 2 100
3 3 100
3 4 100
3 5 100
3 6 100
3 7 100
3 8 100
3 9 0
4 0 100
4 1 100
4 2 100
4 3 100
4 4 100
4 5 100
4 6 100
4 7 100
4 8 100
4 9 0
5 0 100
5 1 100
5 2 100
5 3 100
5 4 100
5 5 100
5 6 100
5 7 100
5 8 100
5 9 0
6 0 100
6 1 100
6 2 100
6 3 100
6 4 100
6 5 100
6 6 100
6 7 100
6 8 100
6 9 0
7 0 100
7 1 100
7 2 100
7 3 100
7 4 100
7 5 100
7 6 100
7 7 100
7 8 100
7 9 0
8 0 100
8 1 100
8 2 100
8 3 100
8 4 100
8 5 100
8 6 100
8 7 100
8 8 100
8 9 0
9 0 100
9 1 100
9 2 100
9 3 100
9 4 100
9 5 100
9 6 100
9 7 100
9 8 100
9 9 0