I have a kernel that calls the function below. When I compile it under CUDA 1.1 with the option --ptxas-options=-v, I see that my kernel uses 96 registers, while when I compile it under CUDA 2.0 I get a register count of 60. I find this number too high, especially since the occupancy calculator shows that the occupancy stays very low. Can someone tell me why such a kernel would be so inefficient, given that it has zero explicit global memory accesses (although there may be local memory accesses after compilation), and how I can reduce the register count?
Thank you for your help.
Ali
// NOTE(review): `device` below is presumably the CUDA `__device__` qualifier
// whose underscores were lost in formatting — confirm against the real source.
//
// myFunc evaluates six closed-form terms, A_51 .. A_56, and returns
//     A_51 + A_52 + A_53 + A_54 - A_55 - A_56.
// Each term is built from differences of the form EXP(-k * t) - EXP(-k * T)
// (for several coefficients k), together with polynomial factors in t and T.
//
// `real`, `EXP` and `Sqr` are not defined in this block. The accompanying post
// states that `real` is float and `EXP` is expf; `Sqr` presumably squares its
// argument — TODO confirm.
//
// Parameters (all passed by value as const real):
//   t, T               - the two points at which the exponentials are evaluated.
//   hatar, hataw       - coefficients that appear in the exponents and as divisors.
//   b_rw               - multiplies sigma_w in A_53 / A_54 and scales A_55 and A_56.
//   sigma_r, sigma_w   - used only in A_53 and A_54.
//   theta_w            - used only in A_55.
//   start_w            - used only in A_56.
//   beta0              - used only in A_52.
//   beta1, beta2       - used only in A_51 and A_52.
//   beta3              - used in A_51, A_52 and in oneobeta2mhatar.
//
// Returns: the signed sum of the six terms described above.
//
// The following quantities are used as divisors and are never checked for zero:
// hatar, hataw, beta3, (1 / beta3 - hatar), (1 - hatar * beta3),
// (hataw - hatar), (hatar - hataw), (hataw + hatar), (2 * hataw - hatar).
//
// The same exponentials (e.g. EXP(-hatar * t), EXP(-hatar * T), EXP(-hataw * t),
// EXP(-hataw * T), EXP(-2 * hataw * t), ...) are written out many times below
// instead of being stored once.
// NOTE(review): hoisting them into named locals may reduce redundant work and
// possibly register pressure — measure with --ptxas-options=-v before relying on it.
device
inline real myFunc(const real t, const real T, const real hatar, const real hataw, const real b_rw, const real sigma_r, const real sigma_w,
const real theta_w, const real start_w, const real beta0, const real beta1, const real beta2, const real beta3)
{
// Shared exponent coefficient / divisor for A_51 and A_52.
// NOTE(review): the name mentions beta2 but the value is computed from beta3 — confirm which is intended.
const real oneobeta2mhatar = 1 / beta3 - hatar;
// A_51: depends on beta1, beta2, beta3 and hatar; the whole bracket is scaled by 1 / (beta3 * hatar).
const real A_51 = 1 / (beta3 * hatar) *
(beta3 * (beta2 - beta1) * (EXP(-t / beta3) - EXP(-T / beta3)) -
beta2 * (EXP(-t / beta3) * (t + beta3) - EXP(-T / beta3) * (T + beta3)) -
beta3 * (beta2 - beta1) * EXP(-hatar * T) * (EXP(-t * (oneobeta2mhatar)) - EXP(-T * (oneobeta2mhatar))) / (oneobeta2mhatar) +
beta2 / beta3 * EXP(-hatar * T) * (EXP(-t * (oneobeta2mhatar)) * (t / (oneobeta2mhatar) + 1 / Sqr(oneobeta2mhatar)) -
EXP(-T * (oneobeta2mhatar)) * (T / (oneobeta2mhatar) + 1 / Sqr(oneobeta2mhatar))
)
);
// A_52: the only term that uses beta0; its last bracket repeats the final bracket of A_51.
const real A_52 = beta0 * (T - t) + beta1 * beta3 * (EXP(-t / beta3) - EXP(-T / beta3))
+ beta2 * (EXP(-t / beta3) * (t + beta3) -
EXP(-T / beta3) * (T + beta3)
)
+ beta0 / hatar * (1 - EXP(-hatar * (T - t)))
+ beta1 * beta3 / (1 - hatar * beta3) * EXP(-hatar * T) * (EXP(-t * (oneobeta2mhatar)) - EXP(-T * (oneobeta2mhatar)))
+ beta2 / beta3 * EXP(-hatar * T) *
(EXP(-t * (oneobeta2mhatar)) * (t / (oneobeta2mhatar) + 1 / Sqr(oneobeta2mhatar)) -
EXP(-T * (oneobeta2mhatar)) * (T / (oneobeta2mhatar) + 1 / Sqr(oneobeta2mhatar))
);
// A_53: four summands — two scaled by a sigma_r coefficient, two by a sigma_w * b_rw coefficient.
const real A_53 = Sqr(sigma_r / hatar) / 2 * (T - t - 2 / hatar * (EXP(-hatar * t) - EXP(-hatar * T)) + 1 / (2 * hatar) * (EXP(-2 * hatar * t) - EXP(-2 * hatar * T)))
// NOTE(review): this coefficient is (sigma_w * b_rw / hatar) / 2 without Sqr, whereas the
// other three summands of A_53 use Sqr(...) / 2 — confirm whether Sqr is missing here.
+ (sigma_w * b_rw / hatar) / 2 * (1 / Sqr(hataw) * (T - t - 2 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) + 1 / (2 * hataw) * (EXP(-2 * hataw * t) - EXP(-2 * hataw * T))) +
2 / (hataw * (hataw - hatar)) * (1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) - 1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T)) -
1 / (2 * hataw) * (EXP(-2 * hataw * t) - EXP(-2 * hataw * T)) + 1 / (hataw + hatar) * (EXP(-(hataw + hatar) * t) - EXP(-(hataw + hatar) * T))
) +
1 / Sqr(hataw - hatar) * (1 / (2 * hataw) * (EXP(-2 * hataw * t) - EXP(-2 * hataw * T)) -
2 / (hataw + hatar) * (EXP(-(hataw + hatar) * t) - EXP(-(hataw + hatar) * T)) +
1 / (2 * hatar) * (EXP(-2 * hatar * t) - EXP(-2 * hatar * T))
)
)
// The next two summands are both multiplied by EXP(-hatar * T) and contain
// positive-exponent calls EXP(hatar * T) and EXP(hatar * t).
+ Sqr(sigma_r / hatar) / 2 * EXP(-hatar * T) * (1 / hatar * (EXP(hatar * T) - EXP(hatar * t)) + 1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T)) - 2 * (T - t))
+ Sqr(sigma_w * b_rw / hatar) / 2 * EXP(-hatar * T) *
(
(1 / Sqr(hataw) * (1 / hatar * (EXP(hatar * T) - EXP(hatar * t)) -
2 / (hataw - hatar) * (EXP(-(hataw - hatar) * t) - EXP(-(hataw - hatar) * T)) +
1 / (2 * hataw - hatar) * (EXP(-(2 * hataw - hatar) * t) - EXP(-(2 * hataw - hatar) * T))
)
) +
2 / (hataw * (hataw - hatar)) * (1 / (hataw - hatar) * (EXP(-t * (hataw - hatar)) - EXP(-T * (hataw - hatar))) -
(T - t) -
1 / (2 * hataw - hatar) * (EXP(-t * (2 * hataw - hatar)) - EXP(-T * (2 * hataw - hatar))) +
1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T))
) +
1 / (Sqr(hataw - hatar)) * (1 / (2 * hataw - hatar) * (EXP(-t * (2 * hataw - hatar)) - EXP(-T * (2 * hataw - hatar))) -
2 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) +
1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T))
)
);
// A_54: five summands, each prefixed by 1 / hatar; two use Sqr(sigma_r), three use Sqr(sigma_w * b_rw).
const real A_54 = 1 / hatar * (Sqr(sigma_r) / hatar * (1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T)) - 1 / (2 * hatar) * (EXP(-2 * hatar * t) - EXP(-2 * hatar * T))))
+ 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * hataw * (hatar - hataw)) *
(1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) - 1 / (2 * hataw) * (EXP(-2 * hataw * t) - EXP(-2 * hataw * T)) -
1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T)) + 1 / (hatar + hataw) * (EXP(-(hatar + hataw) * t) - EXP(-(hatar + hataw) * T))
))
// The divisor below contains (hatar - hataw) * (hataw - hatar), i.e. the negated square of the difference.
+ 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw) * (hataw - hatar)) * (1 / (2 * hataw) * (EXP(-2 * hataw * t) - EXP(-2 * hataw * T)) + 1 / (2 * hatar) * (EXP(-2 * hatar * t) - EXP(-2 * hatar * T)) -
2 / (hataw + hatar) * (EXP(-(hatar + hataw) * t) - EXP(-(hatar + hataw) * T))))
+ 1 / hatar * (Sqr(sigma_r) / hatar * EXP(-hatar * T) * (T - t - 1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T))))
+ 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw)) * EXP(-hatar * T) *
(1 / hataw * (1 / (hataw - hatar) * (EXP(-(hataw - hatar) * t) - EXP(-(hataw - hatar) * T)) -
1 / (2 * hataw - hatar) * (EXP(-(2 * hataw - hatar) * t) - EXP(-(2 * hataw - hatar) * T)) -
(T - t) + 1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T))
) +
1 / (hataw - hatar) * (1 / (2 * hataw - hatar) * (EXP(-(2 * hataw - hatar) * t) - EXP(-(2 * hataw - hatar) * T)) -
2 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) +
1 / hatar * (EXP(-hatar * t) - EXP(-hatar * T))
)
)
);
// A_55: the only term that uses theta_w; scaled by b_rw * theta_w / (hataw * hatar).
const real A_55 = b_rw * theta_w / (hataw * hatar) * (T - t - 1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) - 1 / hatar * (1 - EXP(-hatar * (T - t))) +
EXP(-hatar * T) / (hataw - hatar) * (EXP(-t * (hataw - hatar)) - EXP(-T * (hataw - hatar))));
// A_56: the only term that uses start_w; scaled by b_rw * start_w / hatar.
const real A_56 = b_rw * start_w / hatar * (1 / hataw * (EXP(-hataw * t) - EXP(-hataw * T)) -
EXP(-hatar * T) * 1 / (hataw - hatar) * (EXP(-t * (hataw - hatar)) - EXP(-T * (hataw - hatar))));
// A_55 and A_56 are subtracted; the other four terms are added.
return A_51 + A_52 + A_53 + A_54 - A_55 - A_56;
}
PS: By the way, `real` stands for `float` and `EXP` stands for `expf`.