high register count my kernel uses too many registers

I have a kernel that calls the function below. When I compile it under CUDA 1.1 with the option --ptxas-options=-v, I see that my kernel uses 96 registers, while when I compile it under CUDA 2.0 I get a register count of 60. I find this number too high, especially since the occupancy calculator shows that the occupancy stays very low. Can someone tell me why such a kernel would be so inefficient, given that it has zero explicit global memory accesses (though there may be local memory accesses after compilation), and how I can reduce the register count?

thank you for your help.
Ali

// Evaluates the closed-form pieces A_51..A_56 of an integral expression and
// returns A_51 + A_52 + A_53 + A_54 - A_55 - A_56.  All parameters are model
// constants except the integration bounds t and T.  (`real` is float and
// `EXP` is expf, per the author's note.)
// NOTE(review): the bare "device" below is presumably CUDA's "__device__"
// qualifier, mangled when the code was pasted into the forum -- confirm.
device
inline real myFunc(const real t, const real T, const real hatar, const real hataw, const real b_rw, const real sigma_r, const real sigma_w,
const real theta_w, const real start_w, const real beta0, const real beta1, const real beta2, const real beta3)
{
// NOTE(review): despite the "beta2" in the name, this is 1/beta3 - hatar
// (beta3 appears, not beta2).  The original name is kept unchanged so no
// semantics are silently altered -- confirm which was intended.
const real oneobeta2mhatar = 1 / beta3 - hatar;

// ---------------------------------------------------------------------
// Hoist every distinct exponential so EXP (= expf) is evaluated exactly
// once per argument.  expf is not a native hardware instruction: each
// call expands into a multi-instruction block that needs scratch
// registers, and the original expression re-evaluated several of these
// exponentials up to ~10 times each.  Every hoist below is a bit-exact
// common-subexpression elimination (identical operands, identical
// operations), so the returned value is unchanged.
// ---------------------------------------------------------------------
const real e_tb3 = EXP(-t / beta3);
const real e_Tb3 = EXP(-T / beta3);
const real e_rt = EXP(-hatar * t);
const real e_rT = EXP(-hatar * T);
const real e_2rt = EXP(-2 * hatar * t);
const real e_2rT = EXP(-2 * hatar * T);
const real e_wt = EXP(-hataw * t);
const real e_wT = EXP(-hataw * T);
const real e_2wt = EXP(-2 * hataw * t);
const real e_2wT = EXP(-2 * hataw * T);
const real e_wprt = EXP(-(hataw + hatar) * t);      // exp(-(hataw+hatar)*t)
const real e_wprT = EXP(-(hataw + hatar) * T);
const real e_wmrt = EXP(-(hataw - hatar) * t);      // exp(-(hataw-hatar)*t)
const real e_wmrT = EXP(-(hataw - hatar) * T);
const real e_2wmrt = EXP(-(2 * hataw - hatar) * t); // exp(-(2*hataw-hatar)*t)
const real e_2wmrT = EXP(-(2 * hataw - hatar) * T);
const real e_prt = EXP(hatar * t);                  // positive exponent
const real e_prT = EXP(hatar * T);
const real e_rTt = EXP(-hatar * (T - t));
const real e_obt = EXP(-t * oneobeta2mhatar);
const real e_obT = EXP(-T * oneobeta2mhatar);

// Differences of the form exp(..t) - exp(..T) that recur throughout.
const real d_b3 = e_tb3 - e_Tb3;
const real d_r = e_rt - e_rT;
const real d_2r = e_2rt - e_2rT;
const real d_w = e_wt - e_wT;
const real d_2w = e_2wt - e_2wT;
const real d_wpr = e_wprt - e_wprT;
const real d_wmr = e_wmrt - e_wmrT;
const real d_2wmr = e_2wmrt - e_2wmrT;
const real d_ob = e_obt - e_obT;
const real d_pr = e_prT - e_prt;                    // note: T-term first

// Composite terms that A_51 and A_52 share verbatim.
const real inv_ob_sq = 1 / Sqr(oneobeta2mhatar);
const real tb3_term = e_tb3 * (t + beta3) - e_Tb3 * (T + beta3);
const real ob_term = e_obt * (t / oneobeta2mhatar + inv_ob_sq) -
                     e_obT * (T / oneobeta2mhatar + inv_ob_sq);

const real A_51 = 1 / (beta3 * hatar) *
    (beta3 * (beta2 - beta1) * d_b3 -
     beta2 * tb3_term -
     beta3 * (beta2 - beta1) * e_rT * d_ob / oneobeta2mhatar +
     beta2 / beta3 * e_rT * ob_term);

const real A_52 = beta0 * (T - t) + beta1 * beta3 * d_b3
    + beta2 * tb3_term
    + beta0 / hatar * (1 - e_rTt)
    + beta1 * beta3 / (1 - hatar * beta3) * e_rT * d_ob
    + beta2 / beta3 * e_rT * ob_term;

const real A_53 = Sqr(sigma_r / hatar) / 2 * (T - t - 2 / hatar * d_r + 1 / (2 * hatar) * d_2r)
    + (sigma_w * b_rw / hatar) / 2 *
        (1 / Sqr(hataw) * (T - t - 2 / hataw * d_w + 1 / (2 * hataw) * d_2w) +
         2 / (hataw * (hataw - hatar)) * (1 / hataw * d_w - 1 / hatar * d_r -
                                          1 / (2 * hataw) * d_2w + 1 / (hataw + hatar) * d_wpr) +
         1 / Sqr(hataw - hatar) * (1 / (2 * hataw) * d_2w -
                                   2 / (hataw + hatar) * d_wpr +
                                   1 / (2 * hatar) * d_2r))
    + Sqr(sigma_r / hatar) / 2 * e_rT * (1 / hatar * d_pr + 1 / hatar * d_r - 2 * (T - t))
    + Sqr(sigma_w * b_rw / hatar) / 2 * e_rT *
        (1 / Sqr(hataw) * (1 / hatar * d_pr -
                           2 / (hataw - hatar) * d_wmr +
                           1 / (2 * hataw - hatar) * d_2wmr) +
         2 / (hataw * (hataw - hatar)) * (1 / (hataw - hatar) * d_wmr -
                                          (T - t) -
                                          1 / (2 * hataw - hatar) * d_2wmr +
                                          1 / hataw * d_w) +
         1 / Sqr(hataw - hatar) * (1 / (2 * hataw - hatar) * d_2wmr -
                                   2 / hataw * d_w +
                                   1 / hatar * d_r));

const real A_54 = 1 / hatar * (Sqr(sigma_r) / hatar * (1 / hatar * d_r - 1 / (2 * hatar) * d_2r))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * hataw * (hatar - hataw)) *
        (1 / hataw * d_w - 1 / (2 * hataw) * d_2w -
         1 / hatar * d_r + 1 / (hatar + hataw) * d_wpr))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw) * (hataw - hatar)) *
        (1 / (2 * hataw) * d_2w + 1 / (2 * hatar) * d_2r -
         2 / (hataw + hatar) * d_wpr))
    + 1 / hatar * (Sqr(sigma_r) / hatar * e_rT * (T - t - 1 / hatar * d_r))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw)) * e_rT *
        (1 / hataw * (1 / (hataw - hatar) * d_wmr -
                      1 / (2 * hataw - hatar) * d_2wmr -
                      (T - t) + 1 / hataw * d_w) +
         1 / (hataw - hatar) * (1 / (2 * hataw - hatar) * d_2wmr -
                                2 / hataw * d_w +
                                1 / hatar * d_r)));

const real A_55 = b_rw * theta_w / (hataw * hatar) * (T - t - 1 / hataw * d_w - 1 / hatar * (1 - e_rTt) +
    e_rT / (hataw - hatar) * d_wmr);

const real A_56 = b_rw * start_w / hatar * (1 / hataw * d_w -
    e_rT / (hataw - hatar) * d_wmr);

return A_51 + A_52 + A_53 + A_54 - A_55 - A_56;

}

PS : by the way, the real stands for float and EXP stands for expf.

I’m hardly an expert on this, but that’s never stopped me from opening my mouth before. Glancing at your code, you have some massive mathematical expressions with lots of nested operations. Unless those parentheses are all superfluous, the processor needs to explicitly calculate many intermediate values in the process of evaluating the total expression. These intermediate values are presumably all stored in registers. If you look at the PTX code, you may be able to determine whether this is indeed the case. Keep in mind that functions like expf are not natively implemented in the hardware and themselves expand to blocks of code which may require additional registers.

As a side note, it’s also unclear to me whether your constants like “2” will need to be explicitly converted to floats at execution time, since you do not specify them as floats in your code. It might be safer to write 2.0f rather than 2.

Best of luck,
Jason

Try putting often needed expressions like e.g. hataw - hatar into registers. This can be achieved by assigning local variables (declared volatile) like this.

volatile float hataw_minus_hatar = hataw - hatar;

Then replace occurrences of hataw - hatar with hataw_minus_hatar.

I found this can also work with constants e.g.

volatile float two = 2.0f;

and replace all occurrences of 2 with two.

Now this may look counterintuitive, but in some of my code tricks like these have helped reducing the register count.

Depending on the required accuracy of the expf function and the range of input values, you might also want to try generating the expf result with a 1D texture lookup. Alternatively check if there is a lower precision __expf function that executes on hardware.

Christian

Thank you all for your answers. I was hoping not to be asked to split the integrals even further, but there seems to be no other choice. I’ll try what you proposed and get back to you with the results.

cheers,
Ali

Another thing I might look at is the massive parameter passing to the function.

I saw in my code that passing many variables while calling the kernel many times (a loop of kernel calls) impose a great performance toll.

What would be a good alternative to passing many arguments, in your opinion? Constant memory could be one solution; however, I am not sure about its performance.

You can create a device memory pointer (to structured data) for your kernel, allocate memory for it once using cudaMalloc, and call cudaMemcpy each time you need to change the data.

I don't know if that will work, but if you must call the kernel function many times, I would suggest making this area big enough so that you would do one memcpy for many function calls — that is, prepare your call parameters in advance.

But,

The best thing is to call it once and have the kernel do the needed computations.

and this is what i did to fix my problem.