My GPU is a GTX 1060, and yes, it has fewer cores than the Tesla K40.

When I compile, I use maxregcount=32.

The “width_image” and “height” values depend on the image, but they are usually 581x429.

Here is my code, in case it is needed. When I removed the loops and used “collapse(3)” instead, the code had still not finished executing after more than a minute.

/*
 * Minimum-label propagation over the 2-D array label[height][width].
 *
 * For 252 sweeps, every visited element (i, j) with a positive label is
 * compared with each of its eight neighbours; whenever both labels are
 * positive, both are replaced by the smaller of the two.
 *
 * Fixes relative to the pasted version:
 *   - the `[i]` subscript that forum markup had stripped from `label[i][j]`
 *     is restored, and the stray trailing `_` is removed;
 *   - every neighbour access is bounds-checked, so j-1 at j == 0, j+1 at
 *     j == width-1 and i+1 at i == height-1 no longer index outside the
 *     label[:height][:width] extent named in the data clause.
 *
 * NOTE(review): the outer row loop still starts at i = 1, as in the original,
 * so row 0 is only ever updated as the "upper" neighbour of row 1 - confirm
 * that this is intended.
 * NOTE(review): each (i, j) iteration writes to neighbouring elements that
 * other iterations also read and write, so the i/j loops are not independent.
 * Check the compiler feedback to see how the kernels region is scheduled.
 */
#pragma acc data copy(label[:height][:width])
for (int z = 0; z < 252; z++)
{
    #pragma acc kernels loop
    for (int i = 1; i < height; i++)
    {
        #pragma acc loop
        for (int j = 0; j < width; j++)
        {
            /* Which neighbours exist for this element (i-1 always does,
             * because i starts at 1). */
            const int has_left  = (j > 0);
            const int has_right = (j + 1 < width);
            const int has_below = (i + 1 < height);

            /* upper-left neighbour */
            if (has_left && label[i][j] > 0 && label[i-1][j-1] > 0)
            {
                label[i][j]     = min(label[i][j], label[i-1][j-1]);
                label[i-1][j-1] = min(label[i][j], label[i-1][j-1]);
            }

            /* upper neighbour */
            if (label[i][j] > 0 && label[i-1][j] > 0)
            {
                label[i][j]   = min(label[i][j], label[i-1][j]);
                label[i-1][j] = min(label[i][j], label[i-1][j]);
            }

            /* upper-right neighbour */
            if (has_right && label[i][j] > 0 && label[i-1][j+1] > 0)
            {
                label[i][j]     = min(label[i][j], label[i-1][j+1]);
                label[i-1][j+1] = min(label[i][j], label[i-1][j+1]);
            }

            /* left neighbour */
            if (has_left && label[i][j] > 0 && label[i][j-1] > 0)
            {
                label[i][j]   = min(label[i][j], label[i][j-1]);
                label[i][j-1] = min(label[i][j], label[i][j-1]);
            }

            /* right neighbour */
            if (has_right && label[i][j] > 0 && label[i][j+1] > 0)
            {
                label[i][j]   = min(label[i][j], label[i][j+1]);
                label[i][j+1] = min(label[i][j], label[i][j+1]);
            }

            /* lower-right neighbour */
            if (has_below && has_right && label[i][j] > 0 && label[i+1][j+1] > 0)
            {
                label[i][j]     = min(label[i][j], label[i+1][j+1]);
                label[i+1][j+1] = min(label[i][j], label[i+1][j+1]);
            }

            /* lower neighbour */
            if (has_below && label[i][j] > 0 && label[i+1][j] > 0)
            {
                label[i][j]   = min(label[i][j], label[i+1][j]);
                label[i+1][j] = min(label[i][j], label[i+1][j]);
            }

            /* lower-left neighbour */
            if (has_below && has_left && label[i][j] > 0 && label[i+1][j-1] > 0)
            {
                label[i][j]     = min(label[i][j], label[i+1][j-1]);
                label[i+1][j-1] = min(label[i][j], label[i+1][j-1]);
            }
        }
    }
}