Hi,
To make sure that all arrays are contiguous and accessed as fast as possible, I want to flatten all the arrays in my program.
Before implementing it, I ran a test with a 5D array.
#include<stdio.h>
#include<stdlib.h>
/*
 * flat5 - access one element of a 5-D array stored in a flat, contiguous buffer.
 *
 *   m0       base pointer of the flat buffer
 *   u4..u1   upper index limits of dimensions 4..1 (the extent of a dimension
 *            is computed as upper - lower; u5 is not needed for indexing)
 *   l5..l1   lower index limits of dimensions 5..1
 *   i5..i1   indices of the requested element, i1 varying fastest
 *
 * The expansion is an lvalue, so it can be read or assigned to.
 * Every parameter and the whole expansion are parenthesized so that
 * expression arguments (e.g. `base + 1`, `lo + 1`) expand correctly.
 * Arguments are evaluated more than once: do not pass expressions with
 * side effects.
 */
#define flat5(m0, u4, u3, u2, u1, l5, l4, l3, l2, l1, i5, i4, i3, i2, i1) \
    ((m0)[((i5) - (l5)) * ((u4) - (l4)) * ((u3) - (l3)) * ((u2) - (l2)) * ((u1) - (l1)) \
        + ((i4) - (l4)) * ((u3) - (l3)) * ((u2) - (l2)) * ((u1) - (l1)) \
        + ((i3) - (l3)) * ((u2) - (l2)) * ((u1) - (l1)) \
        + ((i2) - (l2)) * ((u1) - (l1)) \
        + ((i1) - (l1))])
/*
 * File-scope lower (l5..l1) and upper (u5..u1) index limits, one pair per
 * array dimension. main() assigns them; the loops in this file treat the
 * upper limits as exclusive (i < u). Functions here that take same-named
 * parameters shadow these globals inside their bodies.
 */
int l5, l4, l3, l2, l1, u5, u4, u3, u2, u1;
/*
 * prin - print a flattened 5-D int array to stdout.
 *
 *   m        flat buffer addressed through flat5()
 *   u5..u1   upper index limits (exclusive) of dimensions 5..1
 *   l5..l1   lower index limits (inclusive) of dimensions 5..1
 *
 * Each row along dimension 1 is printed on one line with tab-prefixed
 * values; after finishing each index of dimensions 3, 4 and 5 a trailer
 * line carrying that index is printed.
 */
void prin(int *m, int u5, int u4, int u3, int u2, int u1, int l5, int l4, int l3, int l2, int l1) {
    for (int d5 = l5; d5 < u5; ++d5) {
        for (int d4 = l4; d4 < u4; ++d4) {
            for (int d3 = l3; d3 < u3; ++d3) {
                for (int d2 = l2; d2 < u2; ++d2) {
                    for (int d1 = l1; d1 < u1; ++d1) {
                        int value = flat5(m, u4, u3, u2, u1, l5, l4, l3, l2, l1, d5, d4, d3, d2, d1);
                        printf("\t%d", value);
                    }
                    printf("\n");
                }
                printf(" %d\n", d3);
            }
            printf(" %d ----------------\n", d4);
        }
        printf("%d ------------------------------\n", d5);
    }
}
/*
 * gpu - device routine that writes one element of the flattened 5-D array.
 *
 *   m        flat buffer addressed through flat5()
 *   u4..u1   upper index limits of dimensions 4..1
 *   l5..l1   lower index limits of dimensions 5..1
 *   i5..i1   indices of the element to write
 *
 * The stored value is 10000*i5 + 1000*i4 + 100*i3 + 10*i2 + i1, so each
 * element records the index tuple it was written for.
 *
 * The routine is called once per element from the innermost (vector) loop
 * and contains no loops of its own, so it is declared `seq` explicitly
 * instead of relying on the compiler's default choice.
 */
#pragma acc routine seq // line26
void gpu(int *restrict m, int u4, int u3, int u2, int u1, int l5, int l4, int l3, int l2, int l1, int i5, int i4, int i3, int i2, int i1) { // line 27
//void gpu(int *restrict m, int i5, int i4, int i3, int i2, int i1) { //line 28
    flat5(m, u4, u3, u2, u1, l5, l4, l3, l2, l1, i5, i4, i3, i2, i1) = 10000 * i5 + 1000 * i4 + 100 * i3 + 10 * i2 + i1;
}
/*
 * Test driver: sets the global index limits, allocates the flat buffer,
 * fills it inside an OpenACC parallel region by calling gpu() for every
 * index tuple, then frees the buffer.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void) {
    l5 = 1, l4 = 1, l3 = 2, l2 = 1, l1 = 0; //lower limits
    u5 = 3, u4 = 2, u3 = 4, u2 = 5, u1 = 3; //upper limits

    /* Element count is computed once so the allocation and the data clause cannot drift apart. */
    size_t count = (size_t)(u5 - l5) * (size_t)(u4 - l4) * (size_t)(u3 - l3) * (size_t)(u2 - l2) * (size_t)(u1 - l1);

    int *m = calloc(count, sizeof *m);
    if (m == NULL) {
        fprintf(stderr, "calloc failed for %zu elements\n", count);
        return EXIT_FAILURE;
    }

    #pragma acc data copyout(m[0:count])
    {
        /* The collapsed loop pairs are left brace-less so they stay tightly nested. */
        #pragma acc parallel loop gang collapse(2)
        for (int i5 = l5; i5 < u5; i5++)
            for (int i4 = l4; i4 < u4; i4++)
                #pragma acc loop worker collapse(2)
                for (int i3 = l3; i3 < u3; i3++)
                    for (int i2 = l2; i2 < u2; i2++)
                        #pragma acc loop vector
                        for (int i1 = l1; i1 < u1; i1++) {
                            gpu(m, u4, u3, u2, u1, l5, l4, l3, l2, l1, i5, i4, i3, i2, i1); //line 48
                            //gpu(m, i5, i4, i3, i2, i1); // line 49
                        }
    }
    //prin(m, u5, u4, u3, u2, u1, l5, l4, l3, l2, l1);
    free(m);
    return 0;
}
first case - openacc
lines 26, 27, 48 uncommented and lines 28, 49 commented
gpu(int *, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
33, Generating acc routine seq
Generating Tesla code
main:
47, Generating copyout(m[:(u1-l1)*((u2-l2)*((u3-l3)*((u4-l4)*(u5-l5))))])
Accelerator kernel generated
Generating Tesla code
49, #pragma acc loop gang collapse(2) /* blockIdx.x */
50, /* blockIdx.x collapsed */
52, #pragma acc loop worker(4) collapse(2) /* threadIdx.y */
53, /* threadIdx.y collapsed */
55, #pragma acc loop vector(32) /* threadIdx.x */
52, Loop is parallelizable
53, Loop is parallelizable
55, Loop is parallelizable
Question_1
Do I need to pass the limits u4, u3 … l2, l1 to the function when they are declared globally?
Are they not firstprivate in “#pragma acc data” by default?
Passing over 30 parameters to every function (in my program) is a bit too much.
Question_2
I’ve tried “pragma acc declare”. I saw some posts saying that it is not ready yet; does it work in version 17.10?
second case - (it runs only on the CPU)
lines 26, 27, 48 commented and lines 28, 49 uncommented
Question_3
Only “line 33 Generating Tesla code” is missing from the output, which tells me that it is running on the CPU. Am I right?