problem of register & automatic variable

Hi, everyone

first look at my code:[codebox]global void item_Count(int *dk_buffer, …)


extern __shared__ int tran_buffer[];

extern __shared__ int item_cnt[]; // item_cnt[3*blockDim.x]

int* item_id = &item_cnt[blockDim.x];

int* item_cnt_rec = &item_id[blockDim.x];

unsigned int item_idx = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int count = 0, address = 0, tran_begin = 0, tran_sz;

unsigned long i;

unsigned int j, index, index_end;

for(i = 0; i < 232; i++) {

    tran_buffer[threadIdx.x] = dk_buffer[address + threadIdx.x];

    tran_buffer[blockDim.x+threadIdx.x] = dk_buffer[address + blockDim.x + threadIdx.x];


    for(j = 0; j < 512; j++)

        if(tran_buffer[j] == item_idx) count++;

    address += blockDim.x<<1;


// take time of 0.427968 ms

item_cnt[threadIdx.x] = 0; //count;

item_id[threadIdx.x] = item_idx;

item_cnt_rec[threadIdx.x] = 0;


// take time of 13.629280 ms when assign count to item_cnt[threadIdx.x]

// take time of 0.431072 ms when assign zero to item_cnt[threadIdx.x]


kernel configuration: blockDim =dim(128, 1), gridDim =dim(8, 1);

The times i got by commenting subsequential code show that assigning the automatic variable count to item_cnt[threadIdx.x] caused a terrible performance decline, which should not happen because it is just an automatic variable, and i think the variable must not be relocated in local memory. Even it is placed in local memory, there will not be a huge performance drop in terms of a accesses coalescence.

after more test, i found that variables like count, which is much more frequently modified, will have the problem that access to them is very expensive

It seem a problem of register & automatic variable, but i can’t figure it out, so hope someone can give me some advices

Thanks in advance

someone told me that may be a problem of optimization, so i tried to assign all automatic variables in this function to item_cnt[threadIdx.x], no matter used or unused. the test show that all variables take less than 0.5 ms to execute, except that ‘i’ take 6.402080 ms and ‘count’ take 13.629280 ms, which is frequently used and access to which seem very expensive, it ought to be a problem of nvcc optimization…

if you have encountered such a problem, if you know cuda compiler well, especially about optimization, if you know how to handle my problem, please give me a hand, i will very appreciate your help!


ptx code of this function is as following:

[codebox].entry _Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_i


.reg .u16 $rh1,$rh2;

.reg .u32 $r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,




.reg .pred $p0,$p1,$p2,$p3;

.param .s32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .s32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .u32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .u32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .s32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .u32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.param .s32 __cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_


.loc	2	664	0



mov.u32 	$r1, tran_buffer;    	//  

cvt.u32.u16 	$r2, %ntid.x;    	//  

cvt.u32.u16 	$r3, %ctaid.x;   	//  

mul.lo.u32 	$r4, $r3, $r2;    	//  

cvt.u32.u16 	$r5, %tid.x;     	//  

add.u32 	$r6, $r4, $r5;       	//  

mul24.lo.u32 	$r7, $r5, 4;    	//  

add.u32 	$r8, $r5, $r2;       	//  

shl.b32 	$r9, $r2, 1;         	//  

mul.lo.u32 	$r10, $r9, 4;     	//  

add.u32 	$r11, $r7, $r1;      	//  

mul.lo.u32 	$r12, $r8, 4;     	//  

add.u32 	$r13, $r1, 2048;     	//  

ld.param.u32 	$r14, [__cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS

_i_dk_buffer]; // id:61 _cudaparm__Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS


add.u32 	$r15, $r7, $r14;     	//  

add.u32 	$r16, $r12, $r1;     	//  

add.u32 	$r17, $r12, $r14;    	//  

mov.u16 	$rh1, 0;             	//  

mov.u32 	$r18, 0;             	//  


// Loop body line 664, nesting depth: 1, iterations: 232

.loc	2	682	0 	$r19, [$r15+0];	//  id:62

st.shared.s32 	[$r11+0], $r19;	//  id:63 tran_buffer+0x0

.loc	2	683	0 	$r20, [$r17+0];	//  id:64

st.shared.s32 	[$r16+0], $r20;	//  id:65 tran_buffer+0x0

.loc	2	684	0

bar.sync 	0;                  	//  

mov.u32 	$r21, tran_buffer;   	//  


// Loop body line 684, nesting depth: 2, iterations: 512

ld.shared.u32 	$r22, [$r21+0];	//  id:66 tran_buffer+0x0 	$p1, $r22, $r6;  	//  

@$p1 bra 	$Lt_12_18;          	//  

// Part of loop body line 684, head labeled $Lt_12_17

.loc	2	686	0

add.u32 	$r18, $r18, 1;       	//  


// Part of loop body line 684, head labeled $Lt_12_17

add.u32 	$r21, $r21, 4;       	// 	$p2, $r21, $r13; 	//  

@$p2 bra 	$Lt_12_17;          	//  

// Part of loop body line 664, head labeled $Lt_12_14

.loc	2	687	0

add.u32 	$r17, $r17, $r10;    	//  

add.u32 	$r15, $r15, $r10;    	//  

add.u16 	$rh1, $rh1, 1;       	//  

mov.s16 	$rh2, 232;           	// 	$p3, $rh1, $rh2; 	//  

@$p3 bra 	$Lt_12_14;          	//  

.loc	2	718	0

mov.u32 	$r23, item_cnt;      	//  

add.u32 	$r24, $r7, $r23;     	//  

st.shared.s32 	[$r24+0], $r18;	//  id:67 item_cnt+0x0

.loc	2	719	0

mul24.lo.u32 	$r25, $r2, 4;   	//  

add.u32 	$r26, $r25, $r23;    	//  

add.u32 	$r27, $r26, $r7;     	//  

st.shared.s32 	[$r27+0], $r6; 	//  id:68 item_cnt+0x0

.loc	2	720	0

mov.s32 	$r28, 0;             	//  

add.u32 	$r29, $r25, $r26;    	//  

add.u32 	$r30, $r7, $r29;     	//  

st.shared.s32 	[$r30+0], $r28;	//  id:69 item_cnt+0x0

.loc	2	721	0

bar.sync 	0;                  	//  

exit;                         	//  

} // _Z29get_1itemset_Count621_reduce2iiPiP7ITEMCNTiS_i[/codebox]

i did some test again, the time i got shows that the more automatic variables are used, the more expensive access to them are. guys who are interesting about this problem can have a look, which i post on general cuda GPU computing discussion,link is as following: [topic=“83868”]seem a problem of register[/topic]

one thing that catches my eye is that you have declared 2 dynamic arrays within 1 kernel which should start

from the same memory address, according to the programming guide.

So I would check this first, maybe this causes your algorithm to hang…

i reuse the same region of shared memory by declaring 2 dynamic arrays which have the same beginning address, it should be all right, because there is the same problem when i declare one dynamic array in another function, here is the code snippet

[codebox]extern shared int itemset;

int* tran_buffer = &itemset[itemset_len*blockDim.x];

int* item_cnt = &itemset[itemset_len*blockDim.x];

int* item_id = &item_cnt[blockDim.x];

int* item_cnt_rec = &item_id[blockDim.x];


there is a code for testing in [topic=“83868”]seem a problem of register[/topic], you can simply run it on youre device, the problem i encountered rerepresent in the simple function