Computing the max value in the kernel function

Hello,

I’ve been trying to implement an algorithm that requires the computation of a few max values.
The exact same algorithm works in serial but it doesn’t work on cuda.

The code looks something like this:

/this is in the Kernel function/

float max = 0.0;
for(int i=0; i<GetElement(elem,node); i++)
{
float qq = GetEl(q, GetElement(NEList, node*maxelems+i));
if (max < qq)
max = qq;
}

q[node] = max;

After this sequence q is left unchanged leading me to believe that assigning max to q[node] is somehow illegal as a consequence of max being computed incorrectly. If I assign any other value to q[node] (1.0 for example or another float) then q changes.
I have no idea why this is the case. Any help welcome.
Thanks

D.

You are probably going to have to post a bit more of your kernel to get a definitive answer, but the obvious possibility is a memory race on q[node]. If more than one thread can write into q[node] at the same time, then the results will be completely unpredictable.

I have a thread for each node so there is only one writing going on at q[node].

And also the nodes i am reading are nodes that are not being written to because that’s what the algorithm handles.

The full Kernel function is as follows:

global void kernelSMOOTH(float *q, float x, float y, float metric, const int samecol, const int adj, const int deg, int maxdeg, const int ENList, const int * NEList, const int elem, int maxelems, int nodes, int elems,int cols,int sign,float *max)

{

int tid1 = threadIdx.x + blockDim.x*blockIdx.x; //index in samecol of node

int nid = -1;

if (tid1 >= cols) return;

int node = GetElement(samecol,tid1); // id of node

float alpha = 0.5;

//initialise max with the value of the maximum functional of the adjacent elements

float max = 0.0;

for(int i=0; i<GetElement(elem,node); i++)

{

float qq = GetEl(q, GetElement(NEList, node*maxelems+i));

if (max < qq)

  max = qq;

}

//now try to find a new position for the current node

for(int i = 0; i<GetElement(deg,node); i++)

{

//compute movement towards each neighbour 

int neigh = GetElement(adj,maxdeg*node+i);

int illegal = 0;

float xx = (1.0 - alpha) * GetEl(x,node) + alpha * GetEl(x,neigh);

float yy = (1.0 - alpha) * GetEl(y,node) + alpha * GetEl(y,neigh);

float m1 = (1.0 - alpha) * GetEl(metric,node*4+0) + alpha * GetEl(metric,neigh*4+0);

float m2 = (1.0 - alpha) * GetEl(metric,node*4+1) + alpha * GetEl(metric,neigh*4+1);

float m3 = (1.0 - alpha) * GetEl(metric,node*4+2) + alpha * GetEl(metric,neigh*4+2);

float m4 = (1.0 - alpha) * GetEl(metric,node*4+3) + alpha * GetEl(metric,neigh*4+3);

//recompute quality of the adjacent elements and get the max quality

float maxq = -1;

for(int j=0; j<GetElement(elem,node); j++)

{

  int currel = GetElement(NEList, node*maxelems + j);

  int k;

  for(k = 0; k<3; k++)

    if (GetElement(ENList,3*currel + k)==node)

      break;

  //k holds the id of my current node

  //id1 and id2 hold the ids of the 2 neighbouring nodes

  int id1 = GetElement(ENList, 3*currel + (k+1)%3);

  int id2 = GetElement(ENList, 3*currel + (k+2)%3);

  //float qq = computeQual2(GetEl(x,id1),GetEl(y,id1),&(metric[4*id1]),GetEl(x,id2),GetEl(y,id2),&(metric[4*id2]),xx,yy,m1,m2,m3,m4);

  float area = computeArea(x[id1],y[id1],x[id2],y[id2],xx,yy);

  float qq;

  if ((sign == 1 && area > 0) || (sign == -1 && area < 0))

  {

    qq = computeQual3(GetEl(x,id1),GetEl(y,id1),GetEl(metric,4*id1),G

etEl(metric,4id1+1),GetEl(metric,4id1+2),GetEl(metric,4*id1

+3),GetEl(x,id2),GetEl(y,id2),GetEl(metric,4*id2),GetEl(metri

c,4id2+1),GetEl(metric,4id2+2),GetEl(metric,4*id2+3),xx,yy,

m1,m2,m3,m4);

    if (maxq < qq)

      maxq = qq;

  }

  else

  {

    illegal = 1;

    break;

  }   

}   

//maxq holds now the max quality of this new position

//test it against the old max and see if it's better

if (illegal == 0 && maxq < GetEl(max,node) && maxq > -1)

{

  SetEl(max,node,maxq);

  nid = neigh;

}

//for this movement towards neighbour go through all it's adjacent elements and recompute the quality with the new coordinates

//find the maximum quality out of all the ones computed and put it in 'max'

//max will be a valuie that i will have to remember along with the id of the neighbour node

//i will have to compare it with the previous maxs found and if it's lower than the best functional so far then i will

//have to remember it along with the neighbour id.

}

//if nid is diffrent from -1 then it means that a new better position has been found

//if nid is -1 then the position remains the same

float newx;

float newy;

if (nid >-1 && nid <nodes)

{

newx = (1.0 - alpha) * GetEl(x,node) + alpha * GetEl(x,nid);

newy = (1.0 - alpha) * GetEl(y,node) + alpha * GetEl(y,nid);

x[node] = newx;

SetEl(y, node, newy);

for(int j = 0;j < 4; j++)

  SetEl(metric, 4*node+j, (1.0 - alpha) * GetEl(metric,node*4+j) + alpha * GetEl(metric,nid*4+j));

for(int j=0; j<GetElement(elem,node); j++)

{

  int currel = GetElement(NEList, node*maxelems + j);

  int k;

  for(k = 0; k<3; k++)

    if (GetElement(ENList, 3*currel + k)==node)

      break;

  //k holds the id of my current node

  //id1 and id2 hold the ids of the 2 neighbouring nodes

  int id1 = GetElement(ENList, 3*currel + (k+1)%3);

  int id2 = GetElement(ENList, 3*currel + (k+2)%3);

  SetEl(q, currel, computeQual3(GetEl(x,id1),GetEl(y,id1),GetEl(metric,4*id1),G

etEl(metric,4id1+1),GetEl(metric,4id1+2),GetEl(metric,4*id1

+3),GetEl(x,id2),GetEl(y,id2),GetEl(metric,4*id2),GetEl(metri

c,4id2+1),GetEl(metric,4id2+2),GetEl(metric,4*id2+3),GetEl(

x,node),GetEl(y,node),GetEl(metric,4node),GetEl(metric,4nod

e+1),GetEl(metric,4node+2),GetEl(metric,4node+3)));

} 

}

}

This works in serial but not in paralell.

Basically it doesn’t make any changes to q because i think it doesn’t compute max correctly and therefore the variable nid is also computed incorrectly. Both max and nid behave in the same way. As soon as I try to use the values in nid or max then the program doesn’t change q anymore.