Hi Guys,
New to the forum. I’m using the text version of the cudaprofiler and I have a routine that the profiler is telling me there are 116 divergent branches in (really bad). The problem is, is that I cannot locate exactly where this is occurring since I’m not using any if-then-else statements or things of that nature. Can anyone help me point this out?
__global__ void
kernel_rhs2 (double *qre, double *qim, double *pre, double *pim,
double *theta, double *r_c)
{
int b;
int a;
b = blockIdx.y * blockDim.y + threadIdx.y;
a = blockIdx.x * blockDim.x + threadIdx.x;
double delta, r1, r2, a2, ctheta, stheta, sigma2, db2dx, ctan, csec;
double qrey = 0.0;
double qreyy = 0.0;
double qimy = 0.0;
double qimyy = 0.0;
double ll_re, ll_im;
a2 = aa * aa;
ctheta = cos (theta[b]);
stheta = sin (theta[b]);
ctan = ctheta / stheta;
csec = 1.0 / stheta;
r1 = r_c[a];
r2 = r1 * r1;
delta = r2 - 2.0 * mass * r1 + a2;
sigma2 = (r2 + a2) * (r2 + a2) - a2 * delta * stheta * stheta;
double bb = (r2 + a2) / sqrt (sigma2);
double cc = delta / sigma2;
double a_re = 2.0 * ss * (r1 * delta - mass * (r2 - a2)) / sigma2;
double a_im = (4.0 * mass * aa * r1 * mm
+ 2.0 * ss * aa * delta * ctheta) / sigma2;
db2dx = delta * pow (bb, 2) / (r2 + a2)
* (4.0 * r1 / (r2 + a2) - 1.0 / sigma2 *
(4.0 * r1 * (r2 + a2) -
a2 * pow (stheta, 2) * (2.0 * r1 - 2.0 * mass)));
double d_re =
-2.0 / r1 / sigma2 * (r1 * ss * (mass - r1) * (r2 + a2) -
(3.0 * a2 + 4.0 * r2) * delta) +
2.0 * bb * ss / sigma2 * (mass * (a2 - r2) + r1 * delta) -
0.5 * db2dx;
double d_im =
2.0 * aa * mm * (r2 + a2) / sigma2 +
2.0 * bb * aa / sigma2 * (2.0 * mm * mass * r1 +
ss * delta * ctheta);
double v_re = delta / r2 / sigma2 * (6.0 * mass * r1 * (ss + 1.0) -
r2 * (7.0 * ss + 6.0) -
6.0 * delta +
r2 * pow (ss * ctan + mm * csec,
2));
double v_im =
2.0 * aa * mm / r1 / sigma2 * (2.0 * r1 * ss * (mass - r1) -
3.0 * delta);
qrey =
(qre_c[idx (b + 1, a)] - qre_c[idx (b - 1, a)]) / (2.0 * dtheta);
qimy =
(qim_c[idx (b + 1, a)] - qim_c[idx (b - 1, a)]) / (2.0 * dtheta);
qreyy =
(qre_c[idx (b + 1, a)] + qre_c[idx (b - 1, a)] -
2.0 * qre_c[idx (b, a)]) / (dtheta * dtheta);
qimyy =
(qim_c[idx (b + 1, a)] + qim_c[idx (b - 1, a)] -
2.0 * qim_c[idx (b, a)]) / (dtheta * dtheta);
ll_re = qreyy + ctan * qrey;
ll_im = qimyy + ctan * qimy;
double qrex = (qre_h[idx (b, a + 1)] - qre_h[idx (b, a)]) / dx;
double qimx = (qim_h[idx (b, a + 1)] - qim_h[idx (b, a)]) / dx;
double prex = (pre_h[idx (b, a + 1)] - pre_h[idx (b, a)]) / dx;
double pimx = (pim_h[idx (b, a + 1)] - pim_h[idx (b, a)]) / dx;
rhs_qre[idx (b, a)] = -bb * qrex + pre_c[idx (b, a)];
rhs_qim[idx (b, a)] = -bb * qimx + pim_c[idx (b, a)];
rhs_pre[idx (b, a)] = tred[b][a] +
bb * prex
+ cc * ll_re
+ d_re * qrex - d_im * qimx
- a_re * pre_c[idx (b, a)] + a_im * pim_c[idx (b, a)]
- v_re * qre_c[idx (b, a)] + v_im * qim_c[idx (b, a)];
rhs_pim[idx (b, a)] = timd[b][a] +
bb * pimx
+ cc * ll_im
+ d_re * qimx + d_im * qrex
- a_re * pim_c[idx (b, a)] - a_im * pre_c[idx (b, a)]
- v_re * qim_c[idx (b, a)] - v_im * qre_c[idx (b, a)];
}