I have a kernel that calls the function below. When I compile it under CUDA 1.1 with the option --ptxas-options=-v, I see that my kernel uses 96 registers, while under CUDA 2.0 I get a register count of 60. I find this number too high, especially since the occupancy calculator shows that the occupancy stays very low. Can someone tell me why such a kernel would be so inefficient, given that it has zero explicit global memory accesses (though there may be local memory accesses after compilation), and how I can reduce the register count?

Thank you for your help.

Ali

__device__

// Evaluates the closed-form sum  A_51 + A_52 + A_53 + A_54 - A_55 - A_56.
//
// Every term is built from differences of exponentials taken at the two
// arguments t and T, for the rates hatar, hataw, 2*hatar, 2*hataw,
// hataw + hatar, hataw - hatar, 2*hataw - hatar, 1/beta3 and 1/beta3 - hatar.
// Each distinct difference is computed once below and reused; the A_5x
// expressions keep the operand order and grouping of the original formula,
// with each named value standing in for the identical parenthesised
// sub-expression.
//
// `real`, `EXP` and `Sqr` are defined outside this function. The intermediate
// values are stored as `real`, so this assumes EXP returns `real` (the author's
// note says real is float and EXP is expf). Sqr is presumably x * x -- confirm.
//
// The expression divides by each of the following, so none may be zero:
//   hatar, hataw, beta3, hataw - hatar, hataw + hatar, 2 * hataw - hatar,
//   1 - hatar * beta3, and 1 / beta3 - hatar.
// No guarding is done here; a zero denominator yields inf/NaN.
//
// Parameters: t, T are the two arguments at which every exponential is
// evaluated; hatar, hataw are the exponential rates; b_rw, sigma_r, sigma_w,
// theta_w, start_w and beta0..beta3 are coefficients of the formula.
// Returns: the value of the sum above.
inline real myFunc(const real t, const real T, const real hatar, const real hataw, const real b_rw, const real sigma_r, const real sigma_w,
                   const real theta_w, const real start_w, const real beta0, const real beta1, const real beta2, const real beta3)
{
    // 1/beta3 - hatar: decay rate of the mixed beta3/hatar exponentials.
    const real kappa = 1 / beta3 - hatar;

    // Length of the interval [t, T].
    const real tau = T - t;

    // exp(-hatar * T), which multiplies several whole groups of terms.
    const real expRT = EXP(-hatar * T);

    // 1 - exp(-hatar * (T - t)).
    const real oneMinusExpRTau = 1 - EXP(-hatar * (T - t));

    // --- Differences involving beta3 ---------------------------------------
    // exp(-t/beta3) - exp(-T/beta3)
    const real dExpBeta = EXP(-t / beta3) - EXP(-T / beta3);
    // exp(-t/beta3) * (t + beta3) - exp(-T/beta3) * (T + beta3)
    const real dExpBetaLin = EXP(-t / beta3) * (t + beta3) - EXP(-T / beta3) * (T + beta3);
    // exp(-kappa * t) - exp(-kappa * T)
    const real dExpK = EXP(-t * (kappa)) - EXP(-T * (kappa));
    // exp(-kappa * t) * (t/kappa + 1/kappa^2) - exp(-kappa * T) * (T/kappa + 1/kappa^2)
    const real dExpKLin = EXP(-t * (kappa)) * (t / (kappa) + 1 / Sqr(kappa)) -
                          EXP(-T * (kappa)) * (T / (kappa) + 1 / Sqr(kappa));

    // --- Differences exp(-rate * t) - exp(-rate * T) ------------------------
    const real dR    = EXP(-hatar * t) - EXP(-hatar * T);                                 // rate hatar
    const real dR2   = EXP(-2 * hatar * t) - EXP(-2 * hatar * T);                         // rate 2*hatar
    const real dW    = EXP(-hataw * t) - EXP(-hataw * T);                                 // rate hataw
    const real dW2   = EXP(-2 * hataw * t) - EXP(-2 * hataw * T);                         // rate 2*hataw
    const real dWR   = EXP(-(hataw + hatar) * t) - EXP(-(hataw + hatar) * T);             // rate hataw + hatar
    // The original spelled the next two both as -(rate) * t and as -t * (rate);
    // the two forms give the same product, so one value serves both.
    const real dWmR  = EXP(-(hataw - hatar) * t) - EXP(-(hataw - hatar) * T);             // rate hataw - hatar
    const real dW2mR = EXP(-(2 * hataw - hatar) * t) - EXP(-(2 * hataw - hatar) * T);     // rate 2*hataw - hatar

    // Growing counterpart: exp(hatar * T) - exp(hatar * t).
    const real dRpos = EXP(hatar * T) - EXP(hatar * t);

    const real A_51 = 1 / (beta3 * hatar) *
        (beta3 * (beta2 - beta1) * dExpBeta -
         beta2 * dExpBetaLin -
         beta3 * (beta2 - beta1) * expRT * dExpK / (kappa) +
         beta2 / beta3 * expRT * dExpKLin);

    const real A_52 = beta0 * tau + beta1 * beta3 * dExpBeta
        + beta2 * dExpBetaLin
        + beta0 / hatar * oneMinusExpRTau
        + beta1 * beta3 / (1 - hatar * beta3) * expRT * dExpK
        + beta2 / beta3 * expRT * dExpKLin;

    // NOTE(review): the second summand's prefactor is (sigma_w * b_rw / hatar) / 2
    // without Sqr, whereas the first and fourth summands use Sqr(...) / 2.
    // Kept exactly as in the original; confirm against the derivation.
    const real A_53 = Sqr(sigma_r / hatar) / 2 * (tau - 2 / hatar * dR + 1 / (2 * hatar) * dR2)
        + (sigma_w * b_rw / hatar) / 2 *
            (1 / Sqr(hataw) * (tau - 2 / hataw * dW + 1 / (2 * hataw) * dW2) +
             2 / (hataw * (hataw - hatar)) *
                 (1 / hataw * dW - 1 / hatar * dR -
                  1 / (2 * hataw) * dW2 + 1 / (hataw + hatar) * dWR) +
             1 / Sqr(hataw - hatar) *
                 (1 / (2 * hataw) * dW2 -
                  2 / (hataw + hatar) * dWR +
                  1 / (2 * hatar) * dR2))
        + Sqr(sigma_r / hatar) / 2 * expRT * (1 / hatar * dRpos + 1 / hatar * dR - 2 * tau)
        + Sqr(sigma_w * b_rw / hatar) / 2 * expRT *
            ((1 / Sqr(hataw) *
                 (1 / hatar * dRpos -
                  2 / (hataw - hatar) * dWmR +
                  1 / (2 * hataw - hatar) * dW2mR)) +
             2 / (hataw * (hataw - hatar)) *
                 (1 / (hataw - hatar) * dWmR -
                  tau -
                  1 / (2 * hataw - hatar) * dW2mR +
                  1 / hataw * dW) +
             1 / (Sqr(hataw - hatar)) *
                 (1 / (2 * hataw - hatar) * dW2mR -
                  2 / hataw * dW +
                  1 / hatar * dR));

    const real A_54 = 1 / hatar * (Sqr(sigma_r) / hatar * (1 / hatar * dR - 1 / (2 * hatar) * dR2))
        + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * hataw * (hatar - hataw)) *
            (1 / hataw * dW - 1 / (2 * hataw) * dW2 -
             1 / hatar * dR + 1 / (hatar + hataw) * dWR))
        + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw) * (hataw - hatar)) *
            (1 / (2 * hataw) * dW2 + 1 / (2 * hatar) * dR2 -
             2 / (hataw + hatar) * dWR))
        + 1 / hatar * (Sqr(sigma_r) / hatar * expRT * (tau - 1 / hatar * dR))
        + 1 / hatar * (Sqr(sigma_w * b_rw) / (hatar * (hatar - hataw)) * expRT *
            (1 / hataw *
                 (1 / (hataw - hatar) * dWmR -
                  1 / (2 * hataw - hatar) * dW2mR -
                  tau + 1 / hataw * dW) +
             1 / (hataw - hatar) *
                 (1 / (2 * hataw - hatar) * dW2mR -
                  2 / hataw * dW +
                  1 / hatar * dR)));

    const real A_55 = b_rw * theta_w / (hataw * hatar) *
        (tau - 1 / hataw * dW - 1 / hatar * oneMinusExpRTau +
         expRT / (hataw - hatar) * dWmR);

    const real A_56 = b_rw * start_w / hatar *
        (1 / hataw * dW -
         expRT * 1 / (hataw - hatar) * dWmR);

    return A_51 + A_52 + A_53 + A_54 - A_55 - A_56;
}

PS: by the way, real stands for float and EXP stands for expf.