high register count my kernel uses too many registers

I have a kernel that calls the function below. When I compile it under CUDA 1.1 with the option --ptxas-options=-v, I see that my kernel uses 96 registers, while when I compile it under CUDA 2.0 I get a register count of 60. I find this number too high, especially since the occupancy calculator shows that the occupancy stays very low. Can someone tell me why such a kernel would be so inefficient, given that it has zero explicit global memory accesses (though there may be local memory accesses after compilation), and how I can reduce the register count?

thank you for your help.
Ali

// Evaluates the closed-form pieces A_51..A_56 of an integral expression and
// returns A_51 + A_52 + A_53 + A_54 - A_55 - A_56.  All parameters are model
// constants except the integration bounds t and T.  (`real` is float and
// `EXP` is expf, per the author's note.)
// NOTE(review): the bare "device" below is presumably CUDA's "__device__"
// qualifier, mangled when the code was pasted into the forum -- confirm.
device
inline real myFunc(const real t, const real T, const real hatar, const real hataw, const real b_rw, const real sigma_r, const real sigma_w,
const real theta_w, const real start_w, const real beta0, const real beta1, const real beta2, const real beta3)
{
// NOTE(review): despite the "beta2" in the name, this is 1/beta3 - hatar
// (beta3 appears, not beta2).  The original name is kept unchanged so no
// semantics are silently altered -- confirm which was intended.
const real oneobeta2mhatar = 1 / beta3 - hatar;

// ---------------------------------------------------------------------
// Hoist every distinct exponential so EXP (= expf) is evaluated exactly
// once per argument.  expf is not a native hardware instruction: each
// call expands into a multi-instruction block that needs scratch
// registers, and the original expression re-evaluated several of these
// exponentials up to ~10 times each.  Every hoist below is a bit-exact
// common-subexpression elimination (identical operands, identical
// operations), so the returned value is unchanged.
// ---------------------------------------------------------------------
const real e_tb3 = EXP(-t / beta3);
const real e_Tb3 = EXP(-T / beta3);
const real e_rt = EXP(-hatar * t);
const real e_rT = EXP(-hatar * T);
const real e_2rt = EXP(-2 * hatar * t);
const real e_2rT = EXP(-2 * hatar * T);
const real e_wt = EXP(-hataw * t);
const real e_wT = EXP(-hataw * T);
const real e_2wt = EXP(-2 * hataw * t);
const real e_2wT = EXP(-2 * hataw * T);
const real e_wprt = EXP(-(hataw + hatar) * t);      // exp(-(hataw+hatar)*t)
const real e_wprT = EXP(-(hataw + hatar) * T);
const real e_wmrt = EXP(-(hataw - hatar) * t);      // exp(-(hataw-hatar)*t)
const real e_wmrT = EXP(-(hataw - hatar) * T);
const real e_2wmrt = EXP(-(2 * hataw - hatar) * t); // exp(-(2*hataw-hatar)*t)
const real e_2wmrT = EXP(-(2 * hataw - hatar) * T);
const real e_prt = EXP(hatar * t);                  // positive exponent
const real e_prT = EXP(hatar * T);
const real e_rTt = EXP(-hatar * (T - t));
const real e_obt = EXP(-t * oneobeta2mhatar);
const real e_obT = EXP(-T * oneobeta2mhatar);

// Differences of the form exp(..t) - exp(..T) that recur throughout.
const real d_b3 = e_tb3 - e_Tb3;
const real d_r = e_rt - e_rT;
const real d_2r = e_2rt - e_2rT;
const real d_w = e_wt - e_wT;
const real d_2w = e_2wt - e_2wT;
const real d_wpr = e_wprt - e_wprT;
const real d_wmr = e_wmrt - e_wmrT;
const real d_2wmr = e_2wmrt - e_2wmrT;
const real d_ob = e_obt - e_obT;
const real d_pr = e_prT - e_prt;                    // note: T-term first

// Composite terms that A_51 and A_52 share verbatim.
const real inv_ob_sq = 1 / Sqr(oneobeta2mhatar);
const real tb3_term = e_tb3 * (t + beta3) - e_Tb3 * (T + beta3);
const real ob_term = e_obt * (t / oneobeta2mhatar + inv_ob_sq) -
                     e_obT * (T / oneobeta2mhatar + inv_ob_sq);

const real A_51 = 1 / (beta3 * hatar) *
    (beta3 * (beta2 - beta1) * d_b3 -
     beta2 * tb3_term -
     beta3 * (beta2 - beta1) * e_rT * d_ob / oneobeta2mhatar +
     beta2 / beta3 * e_rT * ob_term);

const real A_52 = beta0 * (T - t) + beta1 * beta3 * d_b3
    + beta2 * tb3_term
    + beta0 / hatar * (1 - e_rTt)
    + beta1 * beta3 / (1 - hatar * beta3) * e_rT * d_ob
    + beta2 / beta3 * e_rT * ob_term;

const real A_53 = Sqr(sigma_r / hatar) / 2 * (T - t - 2 / hatar * d_r + 1 / (2 * hatar) * d_2r)
    + (sigma_w * b_rw / hatar) / 2 *
        (1 / Sqr(hataw) * (T - t - 2 / hataw * d_w + 1 / (2 * hataw) * d_2w) +
         2 / (hataw * (hataw - hatar)) * (1 / hataw * d_w - 1 / hatar * d_r -
                                          1 / (2 * hataw) * d_2w + 1 / (hataw + hatar) * d_wpr) +
         1 / Sqr(hataw - hatar) * (1 / (2 * hataw) * d_2w -
                                   2 / (hataw + hatar) * d_wpr +
                                   1 / (2 * hatar) * d_2r))
    + Sqr(sigma_r / hatar) / 2 * e_rT * (1 / hatar * d_pr + 1 / hatar * d_r - 2 * (T - t))
    + Sqr(sigma_w * b_rw / hatar) / 2 * e_rT *
        (1 / Sqr(hataw) * (1 / hatar * d_pr -
                           2 / (hataw - hatar) * d_wmr +
                           1 / (2 * hataw - hatar) * d_2wmr) +
         2 / (hataw * (hataw - hatar)) * (1 / (hataw - hatar) * d_wmr -
                                          (T - t) -
                                          1 / (2 * hataw - hatar) * d_2wmr +
                                          1 / hataw * d_w) +
         1 / Sqr(hataw - hatar) * (1 / (2 * hataw - hatar) * d_2wmr -
                                   2 / hataw * d_w +
                                   1 / hatar * d_r));

const real A_54 = 1 / hatar * (Sqr(sigma_r) / hatar * (1 / hatar * d_r - 1 / (2 * hatar) * d_2r))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * hataw * (hatar - hataw)) *
        (1 / hataw * d_w - 1 / (2 * hataw) * d_2w -
         1 / hatar * d_r + 1 / (hatar + hataw) * d_wpr))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw) * (hataw - hatar)) *
        (1 / (2 * hataw) * d_2w + 1 / (2 * hatar) * d_2r -
         2 / (hataw + hatar) * d_wpr))
    + 1 / hatar * (Sqr(sigma_r) / hatar * e_rT * (T - t - 1 / hatar * d_r))
    + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw)) * e_rT *
        (1 / hataw * (1 / (hataw - hatar) * d_wmr -
                      1 / (2 * hataw - hatar) * d_2wmr -
                      (T - t) + 1 / hataw * d_w) +
         1 / (hataw - hatar) * (1 / (2 * hataw - hatar) * d_2wmr -
                                2 / hataw * d_w +
                                1 / hatar * d_r)));

const real A_55 = b_rw * theta_w / (hataw * hatar) * (T - t - 1 / hataw * d_w - 1 / hatar * (1 - e_rTt) +
    e_rT / (hataw - hatar) * d_wmr);

const real A_56 = b_rw * start_w / hatar * (1 / hataw * d_w -
    e_rT / (hataw - hatar) * d_wmr);

return A_51 + A_52 + A_53 + A_54 - A_55 - A_56;

}

PS : by the way, the real stands for float and EXP stands for expf.

I’m hardly an expert on this, but that’s never stopped me from opening my mouth before. Glancing at your code, you have some massive mathematical expressions with lots of nested operations. Unless those parentheses are all superfluous, the processor needs to explicitly calculate many intermediate values in the process of evaluating the total expression. These intermediate values are presumably all stored in registers. If you look at the PTX code, you may be able to determine whether this is indeed the case. Keep in mind that functions like expf are not natively implemented in the hardware and themselves expand to blocks of code which may require additional registers.

As a side note, it’s also unclear to me whether your constants like “2” will need to be explicitly converted to floats at execution time, since you do not specify them as floats in your code. It might be safer to write 2.0f rather than 2.

Best of luck,
Jason

Try putting often needed expressions like e.g. hataw - hatar into registers. This can be achieved by assigning local variables (declared volatile) like this.

volatile float hataw_minus_hatar = hataw - hatar;

Then replace occurrences of hataw - hatar with hataw_minus_hatar.

I found this can also work with constants e.g.

volatile float two = 2.0f;

and replace all occurrences of 2 with two.

Now this may look counterintuitive, but in some of my code tricks like these have helped reducing the register count.

Depending on the required accuracy of the expf function and the range of input values, you might also want to try generating the expf result with a 1D texture lookup. Alternatively check if there is a lower precision __expf function that executes on hardware.

Christian

Thank you all for your answers. I was hoping not to be asked to split the integrals even further, but there seems to be no other choice. I’ll try what you proposed and get back to you with the results.

cheers,
Ali

Another thing I might look at is the massive parameter passing to the function.

I saw in my code that passing many variables while calling the kernel many times (a loop of kernel calls) impose a great performance toll.

What would be a good alternative to passing many arguments, in your opinion? Constant memory could be one solution; however, I am not sure about its performance.

You can create a device memory pointer (to structured data) for your kernel, allocate memory for it once using cudaMalloc, and call cudaMemcpy each time you need to change the data.

I don't know if that will work, but if you must call the kernel function many times, I would suggest making this area big enough so that you would do one memcpy for many function calls — that is, prepare your call parameters in advance.

But,

The best thing is to call it once and have the kernel do the needed computations.

and this is what i did to fix my problem.