Thanks for the thoughts. I finally got around to eliminating that spurious zc copy, and, as expected, that doesn’t fix anything. The zc is only used on the cpu as you can see, since I haven’t ellided any of the code in between on this function. However, it was a good motivation to eliminate that ugly piece of code.
The reason this code is so hard to debug is that it is auto-generated as part of a shared object that relies on third-party software in order to function properly, so it’s very hard to isolate it out of that environment.
Here’s the ai() function:
V aa(A*a,I tp){frea(a);B c=1;DO(i,a->r)c*=a->s[i];B z=0;
B pc=8*ceil(c/8.0);
switch(tp){
case 1:z=sizeof(I)*pc;break;
case 2:z=sizeof(D)*pc;break;
case 3:z=ceil((sizeof(U8)*pc)/8.0);break;
default: error(16);}
z=4*ceil(z/4.0);char*v=malloc(z);if(NULL==v)error(1);
#ifdef _OPENACC
#pragma acc enter data create(v[:z])
#endif
a->v=v;a->z=z;a->c=c;a->f=2;}
V ai(A*a,I r,B *s,I tp){a->r=r;DO(i,r)a->s[i]=s[i];aa(a,tp);}
It’s a fairly straightforward allocation and simply makes sure that the array allocated is also allocated on the GPU.
I was also thinking that zvi and rvi must be miscalculated somehow, but here are a few examples of codes and outputs that make me think otherwise:
static void fn_1_1ii(A*z,A*l,A*r,A*penv[]){
A env0[1];A*env[]={env0,penv[0]};
DO(i,1)env0[i].v=NULL;
{B zc=1,rc=1,lc=1;
A *rslt=&env[0][0];A *rgt=r;A *lft=l;
I rr=rgt->r;I lr=lft->r;
B*restrict rs=rgt->s;B*restrict ls=lft->s;
aplint32 *restrict rv=(rgt)->v;aplint32 *restrict lv=(lft)->v;
I zr;B zs[15];
if(rr!=0&&lr!=0&&abs(rr-lr)>1)error(4);int minr=rr>lr?lr:rr;
if(lr==rr&&rr>0){I n=rr-1;DO(i,n)if(rs[i+1]!=ls[i+1])error(5);}
else if(lr<rr){DO(i,lr)if(ls[i]!=rs[i+1])error(5);}
else{DO(i,rr)if(ls[i+1]!=rs[i])error(5);}
zs[0]=1;if(lr>rr){zr=lr;DO(i,lr)zs[i]=ls[i];}
else{zr=rr;DO(i,rr)zs[i]=rs[i];}
zr=zr==0?1:zr;zs[0]+=minr==zr?ls[0]:1;
ai(rslt,zr,zs,1);
aplint32 *restrict zv=(rslt)->v;
DO(i,zr)zc*=zs[i];DO(i,lr)lc*=ls[i];DO(i,rr)rc*=rs[i];
I zcp=zc;I rcp=rc;I lcp=lc;
I lt=lr!=0;I rt=rr!=0;zc/=zc==0?1:zs[0];rc=rr==0?zc:rc;lc=lr==0?zc:lc;
#pragma acc kernels loop present(zv[:zcp],lv[:lcp])
DO(i,lc){I lvi=lt*i;zv[i]=lv[lvi];}
#pragma acc kernels loop independent present(zv[:zcp],rv[:rcp])
DO(i,100){I zvi=lc+i;I rvi=rt*i;zv[i]=i;}
}
cpaa(z,&env[0][0]);
fe(&env0[1],0);}
Notice here that I specifically avoid using zvi or rvi, and I also use 100 instead of rc as the loop bounds, and this is the result I get:
0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
30 31 32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47 48 49
50 51 52 53 54 55 56 57 58 59
60 61 62 63 64 65 66 67 68 69
70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89
90 91 92 93 94 95 96 97 98 99
0 585 ¯1 0 0 0 0 0 0 586
¯1 0 0 0 0 0 0 587 ¯1 0
0 0 0 0 0 588 ¯1 0 0 0
~
In fact, this is exactly what I expect to get in this case. But now, let’s use rc instead of using 100, but still avoid using zvi or rvi:
static void fn_1_1ii(A*z,A*l,A*r,A*penv[]){
A env0[1];A*env[]={env0,penv[0]};
DO(i,1)env0[i].v=NULL;
{B zc=1,rc=1,lc=1;
A *rslt=&env[0][0];A *rgt=r;A *lft=l;
I rr=rgt->r;I lr=lft->r;
B*restrict rs=rgt->s;B*restrict ls=lft->s;
aplint32 *restrict rv=(rgt)->v;aplint32 *restrict lv=(lft)->v;
I zr;B zs[15];
if(rr!=0&&lr!=0&&abs(rr-lr)>1)error(4);int minr=rr>lr?lr:rr;
if(lr==rr&&rr>0){I n=rr-1;DO(i,n)if(rs[i+1]!=ls[i+1])error(5);}
else if(lr<rr){DO(i,lr)if(ls[i]!=rs[i+1])error(5);}
else{DO(i,rr)if(ls[i+1]!=rs[i])error(5);}
zs[0]=1;if(lr>rr){zr=lr;DO(i,lr)zs[i]=ls[i];}
else{zr=rr;DO(i,rr)zs[i]=rs[i];}
zr=zr==0?1:zr;zs[0]+=minr==zr?ls[0]:1;
ai(rslt,zr,zs,1);
aplint32 *restrict zv=(rslt)->v;
DO(i,zr)zc*=zs[i];DO(i,lr)lc*=ls[i];DO(i,rr)rc*=rs[i];
I zcp=zc;I rcp=rc;I lcp=lc;
I lt=lr!=0;I rt=rr!=0;zc/=zc==0?1:zs[0];rc=rr==0?zc:rc;lc=lr==0?zc:lc;
#pragma acc kernels loop present(zv[:zcp],lv[:lcp])
DO(i,lc){I lvi=lt*i;zv[i]=lv[lvi];}
#pragma acc kernels loop independent present(zv[:zcp],rv[:rcp])
DO(i,rc){I zvi=lc+i;I rvi=rt*i;zv[i]=i;}
}
cpaa(z,&env[0][0]);
fe(&env0[1],0);}
And the results:
0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
4096 4 0 0 0 ¯2147483648 2 0 0 0
4096 0 0 0 128 512 33554432 0 0 0
0 0 268435970 0 8192 0 ¯2147483648 64 0 0
262160 0 4 40960 ¯1842182637 109229574 ¯725417836 1280774144 654939278 ¯1802203980
¯828599784 ¯2042358018 ¯1509113240 ¯1940649856 900891718 ¯1501260518 2122317854 ¯1936320362 1658195078 1554025536
¯1827367295 ¯496615147 1117130880 ¯1836707308 171098672 ¯2100157438 76284120 ¯690872176 ¯2105311088 504763946
43253919 310542996 ¯2146397028 ¯1845324535 ¯1158540796 ¯1871507356 ¯1634434538 ¯795605500 1079871234 10621954
67245062 ¯1809809354 129118303 831689728 525107892 412891584 ¯1802349405 395326976 ¯2105126256 403047600
¯805006714 ¯1778301664 1475278 51643012 ¯1852682106 ¯1013807488 ¯1744371518 1410334852 153518858 817108615
143006712 190725 ¯2138943280 126919186 ¯1254951292 ¯1937334510 ¯1106865636 1141342228 1 0
~
And I tried to see what would happen if I copy instead of using present:
static void fn_1_1ii(A*z,A*l,A*r,A*penv[]){
A env0[1];A*env[]={env0,penv[0]};
DO(i,1)env0[i].v=NULL;
{B zc=1,rc=1,lc=1;
A *rslt=&env[0][0];A *rgt=r;A *lft=l;
I rr=rgt->r;I lr=lft->r;
B*restrict rs=rgt->s;B*restrict ls=lft->s;
aplint32 *restrict rv=(rgt)->v;aplint32 *restrict lv=(lft)->v;
I zr;B zs[15];
if(rr!=0&&lr!=0&&abs(rr-lr)>1)error(4);int minr=rr>lr?lr:rr;
if(lr==rr&&rr>0){I n=rr-1;DO(i,n)if(rs[i+1]!=ls[i+1])error(5);}
else if(lr<rr){DO(i,lr)if(ls[i]!=rs[i+1])error(5);}
else{DO(i,rr)if(ls[i+1]!=rs[i])error(5);}
zs[0]=1;if(lr>rr){zr=lr;DO(i,lr)zs[i]=ls[i];}
else{zr=rr;DO(i,rr)zs[i]=rs[i];}
zr=zr==0?1:zr;zs[0]+=minr==zr?ls[0]:1;
ai(rslt,zr,zs,1);
aplint32 *restrict zv=(rslt)->v;
DO(i,zr)zc*=zs[i];DO(i,lr)lc*=ls[i];DO(i,rr)rc*=rs[i];
I zcp=zc;I rcp=rc;I lcp=lc;
I lt=lr!=0;I rt=rr!=0;zc/=zc==0?1:zs[0];rc=rr==0?zc:rc;lc=lr==0?zc:lc;
#pragma acc update host(zv[:zcp],rv[:rcp],lv[:lcp])
#pragma acc kernels loop copy(zv[:zcp],lv[:lcp])
DO(i,lc){I lvi=lt*i;zv[i]=lv[lvi];}
#pragma acc kernels loop independent copy(zv[:zcp],rv[:rcp])
DO(i,rc){I zvi=lc+i;I rvi=rt*i;zv[i]=i;}
#pragma acc update device(zv[:zcp])
}
cpaa(z,&env[0][0]);
fe(&env0[1],0);}
Notice in the above that I’m still avoiding the use of zvi and rvi, and I’m just using zv_=i; Here’s what I get:_
0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
4096 4 0 0 0 ¯2147483648 2 0 0 0
4096 0 0 0 128 512 33554432 0 0 0
0 0 268435970 0 8192 0 ¯2147483648 64 0 0
262160 0 4 40960 ¯1842182637 109229574 ¯725417836 1280774144 654939278 ¯1802203980
¯828599784 ¯2042358018 ¯1509113240 ¯1940649856 900891718 ¯1501260518 2122317854 ¯1936320362 1658195078 1554025536
¯1827367295 ¯496615147 1117130880 ¯1836707308 171098672 ¯2100157438 76284120 ¯690872176 ¯2105311088 504763946
43253919 310542996 ¯2146397028 ¯1845324535 ¯1158540796 ¯1871507356 ¯1634434538 ¯795605500 1079871234 10621954
67245062 ¯1809809354 129118303 831689728 525107892 412891584 ¯1802349405 395326976 ¯2105126256 403047600
¯805006714 ¯1778301664 1475278 51643012 ¯1852682106 ¯1013807488 ¯1744371518 1410334852 153518858 817108615
143006712 190725 ¯2138943280 126919186 ¯1254951292 ¯1937334510 ¯1106865636 1141342228 1 0
~
I’m not sure why the iteration appears to consistently stop at 30 here (which happens to be the same size as lc). There seems to be a somewhat consistent pattern here that the second loop only iterates the same as the first loop. That is, if the first loop iterates 6, and the second loop should iterate 12, then the second loop only actually fills in 6 values, despite rc == 12. This same thing happens in each case, where we only fill lc number of slots instead of rc number of slots for the second iteration.
That’s actually a very consistent pattern now that I look at it, and seems to be at the heart of the issue.