Hi mkcolg, thanks for your reply.
The error is really caused by “capacity”, and it seems that everything is fine after adding #pragma acc atomic capture to my code.
But there are still some warnings during compilation.
My whole code is shown below.
#include "timer.h" // for time record
#define RN_MULT 3512401965023503517
#define RN_ADD ((uint64_t) 0)
#define RN_MOD ((uint64_t) 1 << 63)
#define RN_MASK (((uint64_t)1 << 63) - 1)
#define getrand(seed) ((double) (seed) / (double) (RN_MOD))
#define update_seed(seed) (((RN_MULT * seed) & RN_MASK) + RN_ADD) & RN_MASK
typedef struct {
double position;
double direction;
double wgt;
int cell;
int group;
uint64_t seed;
int alive;
} Particle;
typedef struct {
double SigmaT[3];
double SigmaA[3];
double NuSigmaF[3];
double SigmaS[5];
int fissionable; /* 0: no 1:yes */
} Material;
typedef struct {
int mat;
double left;
double right;
/* bc=0: normal, bc=1: vacuum, bc=2: reflective */
int leftbc;
int rightbc;
} Cell;
int main(void) {
StartTimer();
const int Neutron = 100000;
const int Batch = 100;
Cell InpCell[4];
Material InpMat[3];
Cell cell0 = {0, 0., 0., 0, 0};
Cell cell1 = {2, 0., 30., 2, 0};
Cell cell2 = {1, 30., 130., 0, 0};
Cell cell3 = {2, 130., 160., 0, 2};
InpCell[0] = cell0;
InpCell[1] = cell1;
InpCell[2] = cell2;
InpCell[3] = cell3;
Material mat0 = {{0., 0., 0.}, // SigmaT
{0., 0., 0.}, // SigmaA
{0., 0., 0.}, // NuSigmaF
{0., 0., 0., 0., 0.}, // SigmaS
0}; // fissionable
Material mat1 = {{0., 2.412957E-01, 9.134388E-01},
{0., 9.078832E-03, 7.483482E-02},
{0., 5.688290E-03, 1.047830E-01},
{0., 2.165434E-01, 1.567340E-02, 0., 8.386040E-01},
1};
Material mat2 = {{0., 2.604754E-01, 1.239454E+00},
{0., 2.043440E-03, 2.747390E-02},
{0., 0., 0.},
{0., 2.296604E-01, 2.877160E-02, 0., 1.211980E+00},
0};
InpMat[0] = mat0;
InpMat[1] = mat1;
InpMat[2] = mat2;
int capacity = 0;
int size = Neutron;
int tot = (int) (1.3 * size);
Particle *par = (Particle *) malloc(size * sizeof(Particle));
Particle *next = (Particle *) malloc(tot * sizeof(Particle));
/* allocate initial random number seed */
par[0].seed = update_seed((uint64_t) 1);
for (int a = 1; a < Neutron; a++){
par[a].seed = update_seed(par[a - 1].seed);
for (int b = 0; b < 1000; b++){
par[a].seed = update_seed(par[a].seed);
}
}
uint64_t last_seed = par[Neutron - 1].seed;
par[0].position = 80.0;
par[0].direction = 2.0 * getrand(par[0].seed) - 1.0;
par[0].seed = update_seed(par[0].seed);
par[0].wgt = 1.0;
par[0].cell = 2;
par[0].group = getrand(par[0].seed) > 0.5 ? 1 : 2;
par[0].seed = update_seed(par[0].seed);
par[0].alive = 1;
for (int i = 1; i < Neutron; i++){
par[i].position = 80.0;
par[i].direction = 2.0 * getrand(par[i].seed) - 1.0;
par[i].seed = update_seed(par[i].seed);
par[i].wgt = 1.0;
par[i].cell = 2;
par[i].group = 2 * getrand(par[i].seed) > 1.0 ? 1 : 2;
par[i].seed = update_seed(par[i].seed);
par[i].alive = 1;
}
double global_tracklength;
double global_collision;
double global_absorption;
double d_collision;
double d_boundary;
double distance;
double k_tracklength;
double k_collision;
double k_absorption;
double keff = 1.0;
printf("Gen k-tra k-abs k-col keff size\n");
int gen;
for (gen = 1; gen <= Batch; gen++){
global_tracklength = 0.;
global_collision = 0.;
global_absorption = 0.;
int CellID, GroupID, MatID;
int k;
#pragma acc parallel copy(capacity) copyin(par[0:size], InpCell[0:4], InpMat[0:3]) copyout(next[0:tot])
#pragma acc loop reduction(+:global_collision, global_absorption, global_tracklength)
for (k = 0; k < size; k++){
int mycap;
while (par[k].alive){
CellID = par[k].cell;
GroupID = par[k].group;
MatID = InpCell[CellID].mat;
double SigmaS = InpMat[MatID].SigmaS[2 * GroupID - 1] + InpMat[MatID].SigmaS[2 * GroupID];
/* find distance to the nearest boundary */
d_boundary = ((par[k].direction > 0 ? InpCell[CellID].right : InpCell[CellID].left) -
par[k].position) / par[k].direction;
/* sample a distance to collision */
d_collision = -log(getrand(par[k].seed)) / InpMat[MatID].SigmaT[GroupID];
par[k].seed = update_seed(par[k].seed);
/* select the smaller between the two distances*/
distance = d_boundary < d_collision ? d_boundary : d_collision;
/* adjust the position of the particle */
par[k].position += distance * par[k].direction;
/* score track-length estimate of keff */
global_tracklength += par[k].wgt * distance * InpMat[MatID].NuSigmaF[GroupID];
/* particle crosses surface */
if (d_boundary < d_collision){
if (par[k].direction > 0.0){
if (InpCell[CellID].rightbc == 1){
par[k].alive = 0;
}
else if (InpCell[CellID].rightbc == 2){
par[k].direction = -par[k].direction;
}
else{
par[k].cell += 1;
}
}
else{
if (InpCell[CellID].leftbc == 1){
par[k].alive = 0;
}
else if (InpCell[CellID].leftbc == 2){
par[k].direction = -par[k].direction;
}
else{
par[k].cell -= 1;
}
}
}
/* or particle has collision */
else{
/* score collision estimate of keff */
global_collision += par[k].wgt * InpMat[MatID].NuSigmaF[GroupID] / InpMat[MatID].SigmaT[GroupID];
/* sample reaction for the material the particle is in */
if (InpMat[MatID].fissionable){
/* determine expected number of neutrons produced */
double nu_t = par[k].wgt / keff * InpMat[MatID].NuSigmaF[GroupID] / InpMat[MatID].SigmaT[GroupID];
int nu = getrand(par[k].seed) > nu_t - (int) nu_t ? (int) nu_t : (int) nu_t + 1;
par[k].seed = update_seed(par[k].seed);
int m;
for (m = 0; m < nu; m++){
#pragma acc atomic capture
mycap = capacity++;
next[mycap].position = par[k].position;
next[mycap].wgt = 1.0;
next[mycap].group = getrand(par[k].seed) < 0.9 ? 1 : 2;
par[k].seed = update_seed(par[k].seed);
next[mycap].direction = getrand(par[k].seed) * 2.0 - 1.0;
par[k].seed = update_seed(par[k].seed);
next[mycap].cell = par[k].cell;
next[mycap].alive = 1;
}
}
/* determine weight absorbed in survival biasing */
double absorb_wgt = par[k].wgt * InpMat[MatID].SigmaA[GroupID] / InpMat[MatID].SigmaT[GroupID];
par[k].wgt -= absorb_wgt;
/* score implicit absorption estimate of keff */
global_absorption +=
absorb_wgt * InpMat[MatID].NuSigmaF[GroupID] / InpMat[MatID].SigmaA[GroupID];
/* sample a scattering reaction and determine the secondary energy of the exiting neutron */
double inf = 0, sup = 0;
double roulette = getrand(par[k].seed);
par[k].seed = update_seed(par[k].seed);
int n;
for (n = 1; n <= 2; n++){
inf = sup;
sup = inf + InpMat[MatID].SigmaS[(GroupID - 1) * 2 + n];
if (roulette > inf / SigmaS && roulette < sup / SigmaS){
par[k].group = n;
par[k].direction = getrand(par[k].seed) * 2.0 - 1.0;
par[k].seed = update_seed(par[k].seed);
}
}
/* play russian roulette */
if (par[k].wgt < 0.25){
double tmp = getrand(par[k].seed);
par[k].seed = update_seed(par[k].seed);
if (tmp < par[k].wgt / 1.0){
par[k].wgt = 1.0;
}
else{
par[k].alive = 0;
}
}
}
}
}
k_tracklength = global_tracklength / (double) size;
k_collision = global_collision / (double) size;
k_absorption = global_absorption / (double) size;
keff = (k_tracklength + k_collision + k_absorption) / 3;
next[0].seed = update_seed(last_seed);
for (int a = 0; a < 1000; a++){
next[0].seed = update_seed(next[0].seed);
}
for (int a = 1; a < capacity; a++){
next[a].seed = update_seed(next[a - 1].seed);
for (int b = 0; b < 1000; b++){
next[a].seed = update_seed(next[a].seed);
}
}
last_seed = next[capacity - 1].seed;
size = capacity;
capacity = 0;
tot = (int) (1.3 * size);
printf(" %-3d %0.5f %0.5f %0.5f %0.5f %d\n", gen, k_tracklength, k_absorption, k_collision,
keff, size);
free(par);
par = next;
next = (Particle *) malloc(tot * sizeof(Particle));
}
free(par);
free(next);
double total_time = GetTimer();
printf("total time: %0.5fs\n", total_time / 1000.0);
return 0;
}
I’m using nVIDIA GTX 680 and compiling the code with pgcc -acc -Minfo=accel -ta=tesla:cc30,time -O0 main.c -o test
Here are the feedback messages
main:
109, Generating copyin(InpCell[:],InpMat[:])
Generating copy(capacity)
Generating copyout(next[:tot])
Generating copyin(par[:size])
Accelerator kernel generated
Generating Tesla code
111, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
Generating reduction(+:global_absorption,global_collision,global_tracklength)
113, #pragma acc loop seq
174, #pragma acc loop seq
201, #pragma acc loop seq
113, Loop carried dependence of par->alive,par->cell,par->direction,par->group,par->position,par->seed,par->wgt prevents parallelization
Complex loop carried dependence of InpMat.NuSigmaF,InpMat.SigmaA,InpMat.SigmaS,InpMat.SigmaT,InpMatfissionable,next->alive,next->cell,next->direction,next->group,next->position,next->wgt,par->alive,par->cell,par->direction,par->group,par->position,par->seed,par->wgt prevents parallelization
Loop carried reuse of next->alive,next->cell,next->direction,next->group,next->position,next->wgt prevents parallelization
Loop carried backward dependence of par->alive,par->cell,par->direction,par->group,par->position,par->seed,par->wgt prevents vectorization
Loop carried dependence of par->direction,par->group,par->position,par->seed prevents vectorization
174, Loop carried dependence of par->seed prevents parallelization
Complex loop carried dependence of next->alive,next->cell,next->direction,next->group,next->position,next->wgt,par->cell,par->position,par->seed prevents parallelization
Loop carried reuse of next->position prevents parallelization
Loop carried backward dependence of par->seed prevents vectorization
201, Loop carried dependence of par->group prevents parallelization
Complex loop carried dependence of InpMat.SigmaS,par->direction,par->group,par->seed prevents parallelization
Loop carried backward dependence of par->group prevents vectorization
Loop carried scalar dependence for sup at line 202
The good news is that the executable runs normally, while the bad one is the results are quite different from the CPU version.
So I wanna know whether there is any connection between the compiler feedback message and the wrong results. Or for other reasons such as RNG function?