__global__ void nblist_hcp_dev(int *Ipcomplex, int *Ipstrand, int *Ipres,
                               REAL_T *x, REAL_T *x_hcp1, REAL_T *x_hcp2, REAL_T *x_hcp3,
                               int *temp_hcp0, int *temp_hcp1, int *temp_hcp2, int *temp_hcp3,
                               int *temp_hcp0_n, int *temp_hcp1_n, int *temp_hcp2_n, int *temp_hcp3_n,
                               int a)
{
   int c1, s1, r1, a1, s1_from, s1_to, r1_from, r1_to, a1_from, a1_to;
   REAL_T dist2;
   int index = blockIdx.x*blockDim.x + threadIdx.x;
   extern __shared__ float4 x_cord[];

   /* global atom index handled by this thread; a arrives as the launch offset */
   a = a + __mul24(blockIdx.x, __mul24(blockDim.x, blockDim.y)) + __mul24(blockDim.x, threadIdx.y) + threadIdx.x;
   if (a >= Natom)
      return;

   /* stage this thread's atom coordinates in shared memory */
   x_cord[threadIdx.x].x = x[3*a+0];
   x_cord[threadIdx.x].y = x[3*a+1];
   x_cord[threadIdx.x].z = x[3*a+2];
   x_cord[threadIdx.x].w = 0.0;
   __syncthreads();

   for (c1 = 0; c1 < Ncomplex; c1++) /* for each complex */
   {
      dist2 = calc_dist2_1(x_cord[threadIdx.x].x, x_cord[threadIdx.x].y, x_cord[threadIdx.x].z,
                           x_hcp3[3*c1+0], x_hcp3[3*c1+1], x_hcp3[3*c1+2]);
      if (dist2 > dist2_hcp3)
      {
         /* complex is beyond the cutoff: keep it at this level */
         temp_hcp3[index*Ncomplex + temp_hcp3_n[index]] = c1;
         (temp_hcp3_n[index])++;
      }
      else
      {
         s1_from = Ipcomplex[c1] - 1;   /* convert 1-based Ip* index to 0-based */
         if (c1 < Ncomplex - 1) { s1_to = Ipcomplex[c1 + 1] - 1; }
         else                   { s1_to = Nstrand; }
         for (s1 = s1_from; s1 < s1_to; s1++) /* for each strand in this complex */
         {
            dist2 = calc_dist2_1(x_cord[threadIdx.x].x, x_cord[threadIdx.x].y, x_cord[threadIdx.x].z,
                                 x_hcp2[3*s1+0], x_hcp2[3*s1+1], x_hcp2[3*s1+2]);
            if (dist2 > dist2_hcp2)
            {
               temp_hcp2[index*Nstrand + temp_hcp2_n[index]] = s1;
               (temp_hcp2_n[index])++;
            }
            else
            {
               r1_from = Ipstrand[s1] - 1;
               if (s1 < Nstrand - 1) { r1_to = Ipstrand[s1 + 1] - 1; }
               else                  { r1_to = Nres; }
               for (r1 = r1_from; r1 < r1_to; r1++) /* for each residue in this strand */
               {
                  dist2 = calc_dist2_1(x_cord[threadIdx.x].x, x_cord[threadIdx.x].y, x_cord[threadIdx.x].z,
                                       x_hcp1[3*r1+0], x_hcp1[3*r1+1], x_hcp1[3*r1+2]);
                  if (dist2 > dist2_hcp1)
                  {
                     temp_hcp1[index*Nres + temp_hcp1_n[index]] = r1;
                     (temp_hcp1_n[index])++;
                  }
                  else
                  {
                     a1_from = Ipres[r1] - 1;
                     if (r1 < Nres - 1) { a1_to = Ipres[r1 + 1] - 1; }
                     else               { a1_to = Natom; }
                     for (a1 = a1_from; a1 < a1_to; a1++) /* for each atom in this residue */
                     {
                        if (a != a1) /* skip self-interaction */
                        {
                           temp_hcp0[index*Natom + temp_hcp0_n[index]] = a1;
                           (temp_hcp0_n[index])++;
                        }
                     } /* end for other atoms */
                  } /* end else residue inside threshold dist */
               } /* end for other residues */
            } /* end else strand inside threshold dist */
         } /* end for other strands */
      } /* end else complex inside threshold dist */
   } /* end for other complexes */
}
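For reference, since the kernel declares extern __shared__ float4 x_cord[], the launch has to pass the dynamic shared-memory size as the third launch parameter. A minimal sketch of the matching launch, assuming the device pointers are already allocated (a_offset is a placeholder name for the last argument, not from my actual host code):

/* Minimal launch sketch, assuming all device pointers are allocated;
   a_offset is a placeholder for the kernel's last argument. The third
   launch parameter sizes the dynamic shared array x_cord[]. */
dim3 block(64);                            /* 64 threads per block (1-D) */
dim3 grid(120);                            /* 120 blocks per grid (1-D)  */
size_t smem = block.x * sizeof(float4);    /* one float4 per thread      */
nblist_hcp_dev<<<grid, block, smem>>>(Ipcomplex, Ipstrand, Ipres,
                                      x, x_hcp1, x_hcp2, x_hcp3,
                                      temp_hcp0, temp_hcp1, temp_hcp2, temp_hcp3,
                                      temp_hcp0_n, temp_hcp1_n, temp_hcp2_n, temp_hcp3_n,
                                      a_offset);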
This is the first kernel. It's using 7 registers and 140 bytes of lmem. I am launching it with just 64 threads per block (1-D) and 120 blocks per grid (1-D).
The second kernel is quite big. I figure that if I can manage to reduce the lmem usage in this one, then I should be able to do the same for the other one as well.
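One change I have been considering (not sure it will actually touch the lmem, since spills depend on what the compiler decides to keep in registers): hold the per-thread list counters in registers and write them back once at the end, instead of going through temp_hcp*_n[index] in global memory on every insertion. A stand-alone toy kernel showing the pattern; all names here (val, cutoff, nitems, list, list_n) are made up for illustration, not from the code above:

/* Toy kernel illustrating the register-counter pattern. Each thread
   scans the candidates, appends passing ones to its own row of list[],
   and stores the final count with a single global write at the end. */
__global__ void build_list(const float *val, float cutoff,
                           int nitems, int *list, int *list_n)
{
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
   int n = 0;                           /* counter lives in a register    */
   for (int i = 0; i < nitems; i++)
      if (val[i] < cutoff)
         list[tid*nitems + n++] = i;    /* append to this thread's row    */
   list_n[tid] = n;                     /* single write-back at the end   */
}

In the real kernel that would mean four register counters (one per hierarchy level) and four stores at the very end, which at least cuts the repeated global-memory round trips.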
Thanks a lot for all your efforts.