Ok…
In general, i am confused. Why would i have a register overspill from shared memory?
I understand that if i use more registers than i should they will spill to local memory, but how comes it is the case with shared mem as well, and how could it be avoided?
I append some code in here, if you can have any ideas or recommendations, you are welcome. The kernel uses 256 threads and 368 blocks. The appended method spills 48 bytes.
device float checkBin_hst_vdw(volatile float4 *x_atomCoordsIn,
const float4 centralAtomIn, const int atomsInCell,
volatile vdwAtom_t vdwAt, volatile vdwAtom_t *vdwAts,
unsigned int loopStart, float d_cutoff) {
// __shared__
volatile float dx;
volatile float dy;
volatile float dz;
volatile float rab2;
volatile float temp_e = 0.0f;
volatile float R_AB6;
volatile float R_AB;
float R_AB7;
float erep;
float rab7;
volatile float epsilon;
volatile float rab;
float eattr;
volatile float e_upper;
float sumsqs;
for (unsigned int position = loopStart; position < atomsInCell; position++) {
dx = x_atomCoordsIn[position].x - centralAtomIn.x;
dy = x_atomCoordsIn[position].y - centralAtomIn.y;
dz = x_atomCoordsIn[position].z - centralAtomIn.z;
rab2 = (dx * dx + dy * dy + dz * dz);
if (rab2 < d_cutoff ) {
// vdwAtom_t vdwAts[l] = vdwAts[l];
sumsqs = (vdwAt.vdwsqrt
+ vdwAts[position].vdwsqrt);
rab = sqrtf(rab2);
R_AB = (vdwAt.R + vdwAts[position].R) * 0.5;
R_AB6 = R_AB * R_AB * R_AB * R_AB * R_AB * R_AB;
e_upper = (181.16 * vdwAt.G * vdwAts[position].G * vdwAt.alpha
* vdwAts[position].alpha);
epsilon = 0.5 * e_upper /( sumsqs* R_AB6);
// hydrogen bond donor
if (vdwAt.DA == 1) {
// hydrogen bond acceptor
if (vdwAts[position].DA == 2) {
// R_AB is scaled to 0.8 for D-A interactions. The value used in the calculation of epsilon is not scaled.
R_AB = 0.8 * R_AB;
R_AB6 *= 0.262144;
} else {
epsilon += epsilon;
}
}
// hydrogen bond donor
else if (vdwAts[position].DA == 1) {
// hydrogen bond acceptor
if (vdwAt.DA == 2) {
// R_AB is scaled to 0.8 for D-A interactions. The value used in the calculation of epsilon is not scaled.
R_AB *= 0.8;
R_AB6 *= 0.262144;
} else {
epsilon += epsilon;
}
} else {
float g_AB= (vdwAt.R - vdwAts[position].R) / (2*R_AB);
R_AB *= (1.0 + 0.2 * (1.0 - exp(-12.0 * g_AB * g_AB)));
R_AB6 = R_AB * R_AB * R_AB * R_AB * R_AB * R_AB;
epsilon =e_upper / (sumsqs * R_AB6);
}
R_AB7 = R_AB6 * R_AB;
rab7 = rab * rab2 * rab2 * rab2;
erep = (1.07 * R_AB) / (rab + 0.07 * R_AB); //***
eattr = (((1.12 * R_AB7) / (rab7 + 0.12 * R_AB7)) - 2.0);
temp_e += epsilon * erep * erep * erep * erep * erep * erep * erep
* eattr;
//------------------electrostatics-----------------
rab += 0.05;
temp_e
+= ((332.0716 * x_atomCoordsIn[position].w
* centralAtomIn.w) / rab);
//-------------------------------------------------
}
}
return temp_e;
}