Hello,
My program has a few FOR loops that can be parallelized.
Approximately, this is how the program works, together with the bounds of the FOR loops:
// Placeholder sketch: `CT` stands in for the real body, which is not shown.
// Presumably this prepares the CT data for the given m, since main calls it
// once per m and the inner loops then "use CT" — TODO confirm.
void function(int m)
{
CT
}
// Pseudo-code outline of the serial program: only the loop bounds are literal;
// `use CT`, `if minim` and `RES[...] = ...` are placeholders for code not shown.
int main(void)
{
// Outermost loop: m = 1..4; function(m) is called once per m, before the p loop.
for (m = 1; m < 5; m++)
{
function(m);
// p = 1..7 and lu = 1..250. Only m and p appear in the RES subscripts below,
// so every lu iteration contributes to the same RES[m][p][0] / RES[m][p][1].
for (p = 1; p < 8; p++)
for (lu = 1; lu < 251; lu++)
{
// First loop nest: 3 x 3 (s, a) combinations, writing RES[m][p][0].
for (s = 0; s < 3; s++)
for (a = 0; a < 3; a++)
{
// i = 1..364, v = 1..9: the part that reads CT.
for (i = 1; i < 365; i++)
for (v = 1; v < 10; v++)
use CT
// j = 1..39: conditional write to RES; presumably keeps a running minimum
// ("minim") — TODO confirm against the real code.
for (j = 1; j < 40; j++)
if minim
RES[m][p][0] = ...
}
// Second loop nest: 3 x 85 (t, c) combinations, writing RES[m][p][1].
// It has the same inner structure as the first nest but 255 outer
// combinations instead of 9.
for (t = 0; t < 3; t++)
for (c = 0; c < 85; c++)
{
// i = 1..364, v = 1..9: the part that reads CT.
for (i = 1; i < 365; i++)
for (v = 1; v < 10; v++)
use CT
// j = 1..39: conditional write to RES, as in the first nest.
for (j = 1; j < 40; j++)
if minim
RES[m][p][1] = ...
}
}
}
}
I have a combined question, software + hardware.
As you can see, the final array of results depends only on m and p from the first and second FOR loops; all the other loops must be executed completely before a value is written to RES.
I have an AMD CPU with 8 physical cores and a GTX 690 (2 GPUs x 1500 CUDA cores).
I need your help to parallelize this program with OMP + ACC so that it uses the hardware resources well.
My problems are:
- I don’t know what I have to write in each #pragma so that RES depends only on m and p and all the other loops are executed completely
- maybe I can add an OMP NOWAIT to the “lu” FOR loop and remove OMP from the “p” FOR loop; this should avoid having to write the dependency on “p” and an OMP atomic on RES
- where should I activate the second GPU? The loops are a bit unbalanced because “c” goes up to 85 while “a” goes only up to 3
Below is my attempt as a beginner; any suggestion is welcome.
// Placeholder sketch: `CT` stands in for the real body, which is not shown.
// Presumably this (re)computes the CT data for the given m, since main calls
// it right before the `acc data copyin(CT)` region — TODO confirm.
void function(int m)
{
CT
}
// Attempted OpenMP + OpenACC annotation of the loop nest. This is pseudo-code:
// `use CT`, `if minim` and `RES[...] = ...` are placeholders for code not shown.
// The NOTE(review) comments flag directive usage that looks problematic; each
// should be confirmed against the compiler documentation and the real code.
int main(void)
{
// m = 1..4 stays serial on the host; function(m) runs once per m.
for (m = 1; m < 5; m++)
{
function(m);
// Data region per m: CT is copied to the device on entry, RES copied back on exit.
// NOTE(review): copyout(RES) of the whole array on every m iteration may
// overwrite host values written for earlier m — confirm, or move the data
// region outside the m loop / restrict it to the current m slice.
// NOTE(review): nothing visible here selects a device, so presumably only the
// default GPU is used; the second GPU would need an explicit device selection
// per host thread — TODO confirm.
#pragma acc data copyin(CT) copyout(RES)
{
// Host threads are spread over p (7 iterations, p = 1..7), so at most 7 threads
// get work from this loop.
// NOTE(review): `shared(??)` is a placeholder; with default(none) every variable
// referenced in the region needs an explicit data-sharing attribute, and the
// inner loop indices (lu, s, a, t, c, i, v, j) presumably have to be private.
#pragma omp parallel for schedule(dynamic) shared(??) default(none)
for (p = 1; p < 8; p++)
for (lu = 1; lu < 251; lu++)
// One accelerator region is entered per (p, lu) pair, i.e. 7 x 250 times per m.
// NOTE(review): `acc region` looks like the older PGI Accelerator directive,
// whereas `acc loop` / `acc atomic` below are OpenACC spellings; OpenACC would
// presumably use `kernels` or `parallel` here — confirm with the compiler docs.
#pragma acc region
{
// NOTE(review): vector(16) is requested on loops with only 3 iterations
// (s and a), and on three nested levels at once; the larger i loop (364
// iterations) looks like the better candidate for vector parallelism.
// NOTE(review): every (s, a) iteration writes the same RES[m][p][0], so the
// loops are not independent with respect to RES unless the update is a
// reduction (presumably a min reduction) — confirm.
#pragma acc loop independent vector(16)
for (s = 0; s < 3; s++)
#pragma acc loop independent vector(16)
for (a = 0; a < 3; a++)
{
#pragma acc loop independent vector(16)
for (i = 1; i < 365; i++)
for (v = 1; v < 10; v++)
use CT
// NOTE(review): `acc atomic` is placed in front of an `if`; the atomic
// construct presumably applies only to a single read/write/update/capture
// statement, so a conditional minimum update would need a different form.
for (j = 1; j < 40; j++)
#pragma acc atomic
if minim
RES[m][p][0] = ...
}
// Second nest: 3 x 85 (t, c) combinations writing RES[m][p][1]; it has 255
// outer combinations versus 9 in the first nest, hence the imbalance.
// NOTE(review): the same concerns as above apply (shared write to
// RES[m][p][1], vector(16) on the 3-iteration t loop, atomic before `if`).
#pragma acc loop independent vector(16)
for (t = 0; t < 3; t++)
#pragma acc loop independent vector(16)
for (c = 0; c < 85; c++)
{
#pragma acc loop independent vector(16)
for (i = 1; i < 365; i++)
for (v = 1; v < 10; v++)
use CT
for (j = 1; j < 40; j++)
#pragma acc atomic
if minim
RES[m][p][1] = ...
}
}
}
}
}