I am doing something wrong, missing something, or a combination of the two…
Firstly, __noinline__ has no effect whatsoever on the register usage.
Secondly, this is the kernel as a shell, stripped of all the device functions:
/*
 * Stripped-down shell of the forward kernel, used to inspect the resource
 * usage of the kernel frame on its own.
 *
 * The only work done here is staging the first start_pnt_range_data_length
 * doubles of d_start_pnt_range_data into block-shared memory, followed by a
 * block-wide barrier. Every other parameter is accepted but not referenced in
 * this shell.
 *
 * Indexing uses threadIdx.x / blockDim.x only.
 * Precondition: d_start_pnt_range_data must point to at least
 * start_pnt_range_data_length readable doubles.
 */
__global__ void _fwd_krnl(bool in_have_ref_case, bool gbl_pnts_rem,
    unsigned int cur_krnl_n, unsigned int comb_arr_size, unsigned int iss_mul_comb_arr_size,
    unsigned int comb_pnt_n, unsigned int tot_block_bnd, unsigned int gbl_pnt_cnt,
    unsigned int block_pnt_cnt_trans_pnt, unsigned int block_pnt_cnt_trans_pnt_block_n,
    unsigned int block_pnt_cntA, unsigned int block_pnt_cntB,
    double in_ref_case, double port_val, double max_fwd, double comb_res,
    bool* d_in_pnt, bool* d_block_bnd_has_range, bool* d_block_posted_ref_case, bool* d_gbl_posted_ref_case,
    bool* d_gbl_set_ref, bool* d_stop_exec,
    char* d_gbl_sol_overflow_type, char* d_block_stat,
    unsigned int* d_pnts_cnt, unsigned int* d_rem_in_val, unsigned int* d_in_pnt_val,
    unsigned int* d_port_pnt_iss_mul_data, unsigned int* d_iss_mul_data, unsigned int* d_iss_port_pnt_in_arr,
    unsigned int* d_1st_ref_upd, unsigned int* d_gbl_block_sol_cnt, unsigned int* d_terminate,
    double* d_fwd, double* d_comb, double* d_start_pnt_range_data, double* d_port_pnt_time,
    double* d_port_pnt_time_diff, double* d_block_start_time_diff_rec, double* d_iss_cpn_in_arr,
    double* d_iss_val, double* d_iss_comb_sum, double* d_port_pnt_mul_sum, double* d_gbl_ref_case,
    double* d_block_ref_case, double* d_best_result_stack, double* d_sol_stack, double* d_worst_ref_case,
    double* d_result_upd, double* d_split_comb, double* d_split_fwd, double* d_split_work_data)
{
    // Block-shared state. Apart from start_pnt_range_data, none of these are
    // read or written in this shell; they are kept so the declarations match
    // what the post describes.
    __shared__ bool break_for_overhead;
    __shared__ bool must_do_overhead;
    __shared__ bool must_terminate;
    __shared__ unsigned int rerun_cnt;
    __shared__ unsigned int cur_pnt_port_pnt[3];
    __shared__ double cur_pnt_fwd[3];
    __shared__ double start_pnt_range_data[start_pnt_range_data_length];
    __shared__ Fwd_krnl_data fwd_krnl_data;

    // Stage the range data into shared memory. A block-stride loop is used
    // instead of a single guarded store so that every element is copied even
    // when the block has fewer than start_pnt_range_data_length threads in x;
    // with a wide enough block each thread still performs at most one copy.
    for (unsigned int idx = threadIdx.x; idx < start_pnt_range_data_length; idx += blockDim.x)
    {
        start_pnt_range_data[idx] = d_start_pnt_range_data[idx];
    }

    // Barrier sits outside the loop so all threads of the block reach it.
    __syncthreads();
}
And this is the ptxas information for it, followed by the original kernel's information:
ptxas info : Compiling entry function '_Z9_fwd_krnlbbjjjjjjjjjjddddPbS_S_S_S_S_PcS0_PjS1_S1_S1_S1_S1_S1_S1_S1_PdS2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_' for 'sm_35'
ptxas info : Function properties for _Z9_fwd_krnlbbjjjjjjjjjjddddPbS_S_S_S_S_PcS0_PjS1_S1_S1_S1_S1_S1_S1_S1_PdS2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_
360 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 97 registers, 16 bytes smem, 688 bytes cmem[0]
ptxas info : Compiling entry function '_Z8fwd_krnlbbjjjjjjjjjjddddPbS_S_S_S_S_PcS0_PjS1_S1_S1_S1_S1_S1_S1_S1_PdS2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_' for 'sm_35'
ptxas info : Function properties for _Z8fwd_krnlbbjjjjjjjjjjddddPbS_S_S_S_S_PcS0_PjS1_S1_S1_S1_S1_S1_S1_S1_PdS2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 180 registers, 507 bytes smem, 688 bytes cmem[0]
The shell itself is already at 97 registers…?