Shared memory equivalent of local memory: beyond 16k intermediate results

Is it possible to allocate per-block global memory to use as extended shared memory in future release?

When the size of an intermediate result may exceed 16 KB and inter-thread sharing is required, we can only store it temporarily. If such “extended shared memory” were available, performance wouldn’t improve, but the memory requirement could be reduced dramatically. These intermediate results are useless once a block is finished. But in CUDA 1.0 we can’t allocate such memory or get the multiprocessor ID, so we have to allocate storage for all blocks, even though only enough for the blocks currently running concurrently is necessary.

Been waiting quite a while for an answer here (Extending shared & fixing local)…

Per-block local memory is an interesting idea, but we don’t have any plans to add this (the hardware doesn’t support it).

That’s sad to hear… Is it possible to get the current block’s multiprocessor ID instead? That could be used to implement ad-hoc per-block memory when no two blocks run on a single multiprocessor simultaneously.

The PTX .sreg state space has some interesting symbols %physid, %pm0, %pm1, %pm2 and %pm3. They are undocumented and I haven’t tried to find out what they hold. Wanna try?

Peter

I’ll try!

By the way, searching physid in the pdf finds something texture related…

test result:

The ptx:

.version 1.0

.target compute_10, map_f64_to_f32

//by ctc's be_ptx

.extern	.shared .align 16 .b8 sharedbase[];

.entry e011e744c_tester
{
	// p_a: global address of the output array of 32-bit words.
	// p_n: number of output elements; threads at or past it write nothing.
	.param .u32 p_a;
	.param .s32 p_n;

	.reg .u32	$blk,$thr,$blkbase,$gid,$limit,$phys,$byteoff,$outbase,$dst;
	.reg .pred	$inrange;

	// gid = ctaid.x * 64 + tid.x  (the shift by 6 hard-codes 64 threads per block)
	cvt.u32.u16	$blk,%ctaid.x;
	cvt.u32.u16	$thr,%tid.x;
	shl.b32	$blkbase,$blk,6;
	add.s32	$gid,$blkbase,$thr;

	// Leave early unless gid < p_n (signed compare).
	ld.param.s32	$limit,p_n;
	setp.lt.s32	$inrange,$gid,$limit;
	@!$inrange	bra	$L_exit;

	// p_a[gid] = %physid, with 4-byte elements.
	mov.u32	$phys,%physid;
	shl.b32	$byteoff,$gid,2;
	ld.param.s32	$outbase,p_a;
	add.s32	$dst,$outbase,$byteoff;
	st.global.s32	[$dst],$phys;

$L_exit:
	exit;
}

output of 32 blocks, 64 threads:

no error

00000000  00000001  00000002  00000003  00000004  00000005  00000006  00000007

00000008  00000009  0000000a  0000000b  0000000c  0000000d  0000000e  0000000f

00000010  00000011  00000012  00000013  00000014  00000015  00000016  00000017

00000018  00000019  0000001a  0000001b  0000001c  0000001d  0000001e  0000001f

00000100  00000101  00000102  00000103  00000104  00000105  00000106  00000107

00000108  00000109  0000010a  0000010b  0000010c  0000010d  0000010e  0000010f

00000110  00000111  00000112  00000113  00000114  00000115  00000116  00000117

00000118  00000119  0000011a  0000011b  0000011c  0000011d  0000011e  0000011f

00010000  00010001  00010002  00010003  00010004  00010005  00010006  00010007

00010008  00010009  0001000a  0001000b  0001000c  0001000d  0001000e  0001000f

00010010  00010011  00010012  00010013  00010014  00010015  00010016  00010017

00010018  00010019  0001001a  0001001b  0001001c  0001001d  0001001e  0001001f

00010100  00010101  00010102  00010103  00010104  00010105  00010106  00010107

00010108  00010109  0001010a  0001010b  0001010c  0001010d  0001010e  0001010f

00010110  00010111  00010112  00010113  00010114  00010115  00010116  00010117

00010118  00010119  0001011a  0001011b  0001011c  0001011d  0001011e  0001011f

00100000  00100001  00100002  00100003  00100004  00100005  00100006  00100007

00100008  00100009  0010000a  0010000b  0010000c  0010000d  0010000e  0010000f

00100010  00100011  00100012  00100013  00100014  00100015  00100016  00100017

00100018  00100019  0010001a  0010001b  0010001c  0010001d  0010001e  0010001f

00100100  00100101  00100102  00100103  00100104  00100105  00100106  00100107

00100108  00100109  0010010a  0010010b  0010010c  0010010d  0010010e  0010010f

00100110  00100111  00100112  00100113  00100114  00100115  00100116  00100117

00100118  00100119  0010011a  0010011b  0010011c  0010011d  0010011e  0010011f

00110000  00110001  00110002  00110003  00110004  00110005  00110006  00110007

00110008  00110009  0011000a  0011000b  0011000c  0011000d  0011000e  0011000f

00110010  00110011  00110012  00110013  00110014  00110015  00110016  00110017

00110018  00110019  0011001a  0011001b  0011001c  0011001d  0011001e  0011001f

00110100  00110101  00110102  00110103  00110104  00110105  00110106  00110107

00110108  00110109  0011010a  0011010b  0011010c  0011010d  0011010e  0011010f

00110110  00110111  00110112  00110113  00110114  00110115  00110116  00110117

00110118  00110119  0011011a  0011011b  0011011c  0011011d  0011011e  0011011f

00200000  00200001  00200002  00200003  00200004  00200005  00200006  00200007

00200008  00200009  0020000a  0020000b  0020000c  0020000d  0020000e  0020000f

00200010  00200011  00200012  00200013  00200014  00200015  00200016  00200017

00200018  00200019  0020001a  0020001b  0020001c  0020001d  0020001e  0020001f

00200100  00200101  00200102  00200103  00200104  00200105  00200106  00200107

00200108  00200109  0020010a  0020010b  0020010c  0020010d  0020010e  0020010f

00200110  00200111  00200112  00200113  00200114  00200115  00200116  00200117

00200118  00200119  0020011a  0020011b  0020011c  0020011d  0020011e  0020011f

00210000  00210001  00210002  00210003  00210004  00210005  00210006  00210007

00210008  00210009  0021000a  0021000b  0021000c  0021000d  0021000e  0021000f

00210010  00210011  00210012  00210013  00210014  00210015  00210016  00210017

00210018  00210019  0021001a  0021001b  0021001c  0021001d  0021001e  0021001f

00210100  00210101  00210102  00210103  00210104  00210105  00210106  00210107

00210108  00210109  0021010a  0021010b  0021010c  0021010d  0021010e  0021010f

00210110  00210111  00210112  00210113  00210114  00210115  00210116  00210117

00210118  00210119  0021011a  0021011b  0021011c  0021011d  0021011e  0021011f

00300000  00300001  00300002  00300003  00300004  00300005  00300006  00300007

00300008  00300009  0030000a  0030000b  0030000c  0030000d  0030000e  0030000f

00300010  00300011  00300012  00300013  00300014  00300015  00300016  00300017

00300018  00300019  0030001a  0030001b  0030001c  0030001d  0030001e  0030001f

00300100  00300101  00300102  00300103  00300104  00300105  00300106  00300107

00300108  00300109  0030010a  0030010b  0030010c  0030010d  0030010e  0030010f

00300110  00300111  00300112  00300113  00300114  00300115  00300116  00300117

00300118  00300119  0030011a  0030011b  0030011c  0030011d  0030011e  0030011f

00310000  00310001  00310002  00310003  00310004  00310005  00310006  00310007

00310008  00310009  0031000a  0031000b  0031000c  0031000d  0031000e  0031000f

00310010  00310011  00310012  00310013  00310014  00310015  00310016  00310017

00310018  00310019  0031001a  0031001b  0031001c  0031001d  0031001e  0031001f

00310100  00310101  00310102  00310103  00310104  00310105  00310106  00310107

00310108  00310109  0031010a  0031010b  0031010c  0031010d  0031010e  0031010f

00310110  00310111  00310112  00310113  00310114  00310115  00310116  00310117

00310118  00310119  0031011a  0031011b  0031011c  0031011d  0031011e  0031011f

00400000  00400001  00400002  00400003  00400004  00400005  00400006  00400007

00400008  00400009  0040000a  0040000b  0040000c  0040000d  0040000e  0040000f

00400010  00400011  00400012  00400013  00400014  00400015  00400016  00400017

00400018  00400019  0040001a  0040001b  0040001c  0040001d  0040001e  0040001f

00400100  00400101  00400102  00400103  00400104  00400105  00400106  00400107

00400108  00400109  0040010a  0040010b  0040010c  0040010d  0040010e  0040010f

00400110  00400111  00400112  00400113  00400114  00400115  00400116  00400117

00400118  00400119  0040011a  0040011b  0040011c  0040011d  0040011e  0040011f

00410000  00410001  00410002  00410003  00410004  00410005  00410006  00410007

00410008  00410009  0041000a  0041000b  0041000c  0041000d  0041000e  0041000f

00410010  00410011  00410012  00410013  00410014  00410015  00410016  00410017

00410018  00410019  0041001a  0041001b  0041001c  0041001d  0041001e  0041001f

00410100  00410101  00410102  00410103  00410104  00410105  00410106  00410107

00410108  00410109  0041010a  0041010b  0041010c  0041010d  0041010e  0041010f

00410110  00410111  00410112  00410113  00410114  00410115  00410116  00410117

00410118  00410119  0041011a  0041011b  0041011c  0041011d  0041011e  0041011f

00500000  00500001  00500002  00500003  00500004  00500005  00500006  00500007

00500008  00500009  0050000a  0050000b  0050000c  0050000d  0050000e  0050000f

00500010  00500011  00500012  00500013  00500014  00500015  00500016  00500017

00500018  00500019  0050001a  0050001b  0050001c  0050001d  0050001e  0050001f

00500100  00500101  00500102  00500103  00500104  00500105  00500106  00500107

00500108  00500109  0050010a  0050010b  0050010c  0050010d  0050010e  0050010f

00500110  00500111  00500112  00500113  00500114  00500115  00500116  00500117

00500118  00500119  0050011a  0050011b  0050011c  0050011d  0050011e  0050011f

00510000  00510001  00510002  00510003  00510004  00510005  00510006  00510007

00510008  00510009  0051000a  0051000b  0051000c  0051000d  0051000e  0051000f

00510010  00510011  00510012  00510013  00510014  00510015  00510016  00510017

00510018  00510019  0051001a  0051001b  0051001c  0051001d  0051001e  0051001f

00510100  00510101  00510102  00510103  00510104  00510105  00510106  00510107

00510108  00510109  0051010a  0051010b  0051010c  0051010d  0051010e  0051010f

00510110  00510111  00510112  00510113  00510114  00510115  00510116  00510117

00510118  00510119  0051011a  0051011b  0051011c  0051011d  0051011e  0051011f

00600000  00600001  00600002  00600003  00600004  00600005  00600006  00600007

00600008  00600009  0060000a  0060000b  0060000c  0060000d  0060000e  0060000f

00600010  00600011  00600012  00600013  00600014  00600015  00600016  00600017

00600018  00600019  0060001a  0060001b  0060001c  0060001d  0060001e  0060001f

00600100  00600101  00600102  00600103  00600104  00600105  00600106  00600107

00600108  00600109  0060010a  0060010b  0060010c  0060010d  0060010e  0060010f

00600110  00600111  00600112  00600113  00600114  00600115  00600116  00600117

00600118  00600119  0060011a  0060011b  0060011c  0060011d  0060011e  0060011f

00610000  00610001  00610002  00610003  00610004  00610005  00610006  00610007

00610008  00610009  0061000a  0061000b  0061000c  0061000d  0061000e  0061000f

00610010  00610011  00610012  00610013  00610014  00610015  00610016  00610017

00610018  00610019  0061001a  0061001b  0061001c  0061001d  0061001e  0061001f

00610100  00610101  00610102  00610103  00610104  00610105  00610106  00610107

00610108  00610109  0061010a  0061010b  0061010c  0061010d  0061010e  0061010f

00610110  00610111  00610112  00610113  00610114  00610115  00610116  00610117

00610118  00610119  0061011a  0061011b  0061011c  0061011d  0061011e  0061011f

00700000  00700001  00700002  00700003  00700004  00700005  00700006  00700007

00700008  00700009  0070000a  0070000b  0070000c  0070000d  0070000e  0070000f

00700010  00700011  00700012  00700013  00700014  00700015  00700016  00700017

00700018  00700019  0070001a  0070001b  0070001c  0070001d  0070001e  0070001f

00700100  00700101  00700102  00700103  00700104  00700105  00700106  00700107

00700108  00700109  0070010a  0070010b  0070010c  0070010d  0070010e  0070010f

00700110  00700111  00700112  00700113  00700114  00700115  00700116  00700117

00700118  00700119  0070011a  0070011b  0070011c  0070011d  0070011e  0070011f

00710000  00710001  00710002  00710003  00710004  00710005  00710006  00710007

00710008  00710009  0071000a  0071000b  0071000c  0071000d  0071000e  0071000f

00710010  00710011  00710012  00710013  00710014  00710015  00710016  00710017

00710018  00710019  0071001a  0071001b  0071001c  0071001d  0071001e  0071001f

00710100  00710101  00710102  00710103  00710104  00710105  00710106  00710107

00710108  00710109  0071010a  0071010b  0071010c  0071010d  0071010e  0071010f

00710110  00710111  00710112  00710113  00710114  00710115  00710116  00710117

00710118  00710119  0071011a  0071011b  0071011c  0071011d  0071011e  0071011f

00000200  00000201  00000202  00000203  00000204  00000205  00000206  00000207

00000208  00000209  0000020a  0000020b  0000020c  0000020d  0000020e  0000020f

00000210  00000211  00000212  00000213  00000214  00000215  00000216  00000217

00000218  00000219  0000021a  0000021b  0000021c  0000021d  0000021e  0000021f

00000300  00000301  00000302  00000303  00000304  00000305  00000306  00000307

00000308  00000309  0000030a  0000030b  0000030c  0000030d  0000030e  0000030f

00000310  00000311  00000312  00000313  00000314  00000315  00000316  00000317

00000318  00000319  0000031a  0000031b  0000031c  0000031d  0000031e  0000031f

00010200  00010201  00010202  00010203  00010204  00010205  00010206  00010207

00010208  00010209  0001020a  0001020b  0001020c  0001020d  0001020e  0001020f

00010210  00010211  00010212  00010213  00010214  00010215  00010216  00010217

00010218  00010219  0001021a  0001021b  0001021c  0001021d  0001021e  0001021f

00010300  00010301  00010302  00010303  00010304  00010305  00010306  00010307

00010308  00010309  0001030a  0001030b  0001030c  0001030d  0001030e  0001030f

00010310  00010311  00010312  00010313  00010314  00010315  00010316  00010317

00010318  00010319  0001031a  0001031b  0001031c  0001031d  0001031e  0001031f

00100200  00100201  00100202  00100203  00100204  00100205  00100206  00100207

00100208  00100209  0010020a  0010020b  0010020c  0010020d  0010020e  0010020f

00100210  00100211  00100212  00100213  00100214  00100215  00100216  00100217

00100218  00100219  0010021a  0010021b  0010021c  0010021d  0010021e  0010021f

00100300  00100301  00100302  00100303  00100304  00100305  00100306  00100307

00100308  00100309  0010030a  0010030b  0010030c  0010030d  0010030e  0010030f

00100310  00100311  00100312  00100313  00100314  00100315  00100316  00100317

00100318  00100319  0010031a  0010031b  0010031c  0010031d  0010031e  0010031f

00110200  00110201  00110202  00110203  00110204  00110205  00110206  00110207

00110208  00110209  0011020a  0011020b  0011020c  0011020d  0011020e  0011020f

00110210  00110211  00110212  00110213  00110214  00110215  00110216  00110217

00110218  00110219  0011021a  0011021b  0011021c  0011021d  0011021e  0011021f

00110300  00110301  00110302  00110303  00110304  00110305  00110306  00110307

00110308  00110309  0011030a  0011030b  0011030c  0011030d  0011030e  0011030f

00110310  00110311  00110312  00110313  00110314  00110315  00110316  00110317

00110318  00110319  0011031a  0011031b  0011031c  0011031d  0011031e  0011031f

00200200  00200201  00200202  00200203  00200204  00200205  00200206  00200207

00200208  00200209  0020020a  0020020b  0020020c  0020020d  0020020e  0020020f

00200210  00200211  00200212  00200213  00200214  00200215  00200216  00200217

00200218  00200219  0020021a  0020021b  0020021c  0020021d  0020021e  0020021f

00200300  00200301  00200302  00200303  00200304  00200305  00200306  00200307

00200308  00200309  0020030a  0020030b  0020030c  0020030d  0020030e  0020030f

00200310  00200311  00200312  00200313  00200314  00200315  00200316  00200317

00200318  00200319  0020031a  0020031b  0020031c  0020031d  0020031e  0020031f

00210200  00210201  00210202  00210203  00210204  00210205  00210206  00210207

00210208  00210209  0021020a  0021020b  0021020c  0021020d  0021020e  0021020f

00210210  00210211  00210212  00210213  00210214  00210215  00210216  00210217

00210218  00210219  0021021a  0021021b  0021021c  0021021d  0021021e  0021021f

00210300  00210301  00210302  00210303  00210304  00210305  00210306  00210307

00210308  00210309  0021030a  0021030b  0021030c  0021030d  0021030e  0021030f

00210310  00210311  00210312  00210313  00210314  00210315  00210316  00210317

00210318  00210319  0021031a  0021031b  0021031c  0021031d  0021031e  0021031f

00300200  00300201  00300202  00300203  00300204  00300205  00300206  00300207

00300208  00300209  0030020a  0030020b  0030020c  0030020d  0030020e  0030020f

00300210  00300211  00300212  00300213  00300214  00300215  00300216  00300217

00300218  00300219  0030021a  0030021b  0030021c  0030021d  0030021e  0030021f

00300300  00300301  00300302  00300303  00300304  00300305  00300306  00300307

00300308  00300309  0030030a  0030030b  0030030c  0030030d  0030030e  0030030f

00300310  00300311  00300312  00300313  00300314  00300315  00300316  00300317

00300318  00300319  0030031a  0030031b  0030031c  0030031d  0030031e  0030031f

00310200  00310201  00310202  00310203  00310204  00310205  00310206  00310207

00310208  00310209  0031020a  0031020b  0031020c  0031020d  0031020e  0031020f

00310210  00310211  00310212  00310213  00310214  00310215  00310216  00310217

00310218  00310219  0031021a  0031021b  0031021c  0031021d  0031021e  0031021f

00310300  00310301  00310302  00310303  00310304  00310305  00310306  00310307

00310308  00310309  0031030a  0031030b  0031030c  0031030d  0031030e  0031030f

00310310  00310311  00310312  00310313  00310314  00310315  00310316  00310317

00310318  00310319  0031031a  0031031b  0031031c  0031031d  0031031e  0031031f

00400200  00400201  00400202  00400203  00400204  00400205  00400206  00400207

00400208  00400209  0040020a  0040020b  0040020c  0040020d  0040020e  0040020f

00400210  00400211  00400212  00400213  00400214  00400215  00400216  00400217

00400218  00400219  0040021a  0040021b  0040021c  0040021d  0040021e  0040021f

00400300  00400301  00400302  00400303  00400304  00400305  00400306  00400307

00400308  00400309  0040030a  0040030b  0040030c  0040030d  0040030e  0040030f

00400310  00400311  00400312  00400313  00400314  00400315  00400316  00400317

00400318  00400319  0040031a  0040031b  0040031c  0040031d  0040031e  0040031f

00410200  00410201  00410202  00410203  00410204  00410205  00410206  00410207

00410208  00410209  0041020a  0041020b  0041020c  0041020d  0041020e  0041020f

00410210  00410211  00410212  00410213  00410214  00410215  00410216  00410217

00410218  00410219  0041021a  0041021b  0041021c  0041021d  0041021e  0041021f

00410300  00410301  00410302  00410303  00410304  00410305  00410306  00410307

00410308  00410309  0041030a  0041030b  0041030c  0041030d  0041030e  0041030f

00410310  00410311  00410312  00410313  00410314  00410315  00410316  00410317

00410318  00410319  0041031a  0041031b  0041031c  0041031d  0041031e  0041031f

00500200  00500201  00500202  00500203  00500204  00500205  00500206  00500207

00500208  00500209  0050020a  0050020b  0050020c  0050020d  0050020e  0050020f

00500210  00500211  00500212  00500213  00500214  00500215  00500216  00500217

00500218  00500219  0050021a  0050021b  0050021c  0050021d  0050021e  0050021f

00500300  00500301  00500302  00500303  00500304  00500305  00500306  00500307

00500308  00500309  0050030a  0050030b  0050030c  0050030d  0050030e  0050030f

00500310  00500311  00500312  00500313  00500314  00500315  00500316  00500317

00500318  00500319  0050031a  0050031b  0050031c  0050031d  0050031e  0050031f

00510200  00510201  00510202  00510203  00510204  00510205  00510206  00510207

00510208  00510209  0051020a  0051020b  0051020c  0051020d  0051020e  0051020f

00510210  00510211  00510212  00510213  00510214  00510215  00510216  00510217

00510218  00510219  0051021a  0051021b  0051021c  0051021d  0051021e  0051021f

00510300  00510301  00510302  00510303  00510304  00510305  00510306  00510307

00510308  00510309  0051030a  0051030b  0051030c  0051030d  0051030e  0051030f

00510310  00510311  00510312  00510313  00510314  00510315  00510316  00510317

00510318  00510319  0051031a  0051031b  0051031c  0051031d  0051031e  0051031f

00600200  00600201  00600202  00600203  00600204  00600205  00600206  00600207

00600208  00600209  0060020a  0060020b  0060020c  0060020d  0060020e  0060020f

00600210  00600211  00600212  00600213  00600214  00600215  00600216  00600217

00600218  00600219  0060021a  0060021b  0060021c  0060021d  0060021e  0060021f

00600300  00600301  00600302  00600303  00600304  00600305  00600306  00600307

00600308  00600309  0060030a  0060030b  0060030c  0060030d  0060030e  0060030f

00600310  00600311  00600312  00600313  00600314  00600315  00600316  00600317

00600318  00600319  0060031a  0060031b  0060031c  0060031d  0060031e  0060031f

00610200  00610201  00610202  00610203  00610204  00610205  00610206  00610207

00610208  00610209  0061020a  0061020b  0061020c  0061020d  0061020e  0061020f

00610210  00610211  00610212  00610213  00610214  00610215  00610216  00610217

00610218  00610219  0061021a  0061021b  0061021c  0061021d  0061021e  0061021f

00610300  00610301  00610302  00610303  00610304  00610305  00610306  00610307

00610308  00610309  0061030a  0061030b  0061030c  0061030d  0061030e  0061030f

00610310  00610311  00610312  00610313  00610314  00610315  00610316  00610317

00610318  00610319  0061031a  0061031b  0061031c  0061031d  0061031e  0061031f

00700200  00700201  00700202  00700203  00700204  00700205  00700206  00700207

00700208  00700209  0070020a  0070020b  0070020c  0070020d  0070020e  0070020f

00700210  00700211  00700212  00700213  00700214  00700215  00700216  00700217

00700218  00700219  0070021a  0070021b  0070021c  0070021d  0070021e  0070021f

00700300  00700301  00700302  00700303  00700304  00700305  00700306  00700307

00700308  00700309  0070030a  0070030b  0070030c  0070030d  0070030e  0070030f

00700310  00700311  00700312  00700313  00700314  00700315  00700316  00700317

00700318  00700319  0070031a  0070031b  0070031c  0070031d  0070031e  0070031f

00710200  00710201  00710202  00710203  00710204  00710205  00710206  00710207

00710208  00710209  0071020a  0071020b  0071020c  0071020d  0071020e  0071020f

00710210  00710211  00710212  00710213  00710214  00710215  00710216  00710217

00710218  00710219  0071021a  0071021b  0071021c  0071021d  0071021e  0071021f

00710300  00710301  00710302  00710303  00710304  00710305  00710306  00710307

00710308  00710309  0071030a  0071030b  0071030c  0071030d  0071030e  0071030f

00710310  00710311  00710312  00710313  00710314  00710315  00710316  00710317

00710318  00710319  0071031a  0071031b  0071031c  0071031d  0071031e  0071031f

%pm? are all 0 in my test.

My interpretation of %physid (8800GTX):

multiprocessor_ID*0x10000 + warp_ID*0x100 + position_in_warp

multiprocessor_ID seems to be one of the following 16 values:

00 10 20 30 40 50 60 70

01 11 21 31 41 51 61 71

I haven’t figured out what warp_ID means yet. It’s not simply counted up in each block.

position_in_warp is 0 to 31

This could indeed be used to implement per-block temporary stuff.

Interesting - forget about blocks - warp ID just counts 0-24 on an MP most likely (consistent with above as you only used warps 0-3 on each MP). So your decode looks right. Definitely 2 MPs are tied pretty closely together - my guess is that the die for G80 has 2 MPs, perhaps 2 die to the package as I think someone mentioned there are 4 packages on an 8800?? Anyhow looks like Nvidia left space for up to 16 MPs on a die.

So this is a bit messy and one can’t really use it till we can inline PTX. Why I asked about a const extern to access %physid but no answer. I had noticed it in the compiler source before the PTX spec came out (could not find any expression). %pm are performance monitors (according to compiler source) and there have been hints that tools are being built to assist in performance tuning so perhaps they are going to use these. Connected to counters of specific events?? Like idle cycles?? As usual we don’t need to know so won’t be told yet.

Extending shared one is limited to indexing by blockID and dicing your app so that you don’t use more blocks than can run on the card concurrently.

Eric

Interesting, indeed.

Ah, makes sense. As they are all 0, perhaps even some hardware piece is still missing. Let’s see what G90 brings.

Peter

.version 1.0

.target compute_10, map_f64_to_f32

//by ctc's be_ptx

.extern	.shared .align 16 .b8 sharedbase[];

// e011e760c_tester
//
// Thread 0 of each block acquires a Lamport-bakery-style lock over 16 slots,
// adds (ctaid.x + 1) to the 32-bit word at p_ret, then releases the lock.
// All other threads exit immediately.
//
// Parameters:
//   p_entering  global address of the lock state. The 16 words at
//               p_entering[0..15] are the per-slot "entering" flags; the
//               16 words starting at byte offset 64 are the per-slot
//               ticket numbers (0 = slot not waiting).
//   p_ret       global address of the 32-bit accumulator.
//   p_n         declared but never read by this kernel.
//
// The slot index is derived from %physid (see below).
// NOTE(review): every access is a plain ld.global / st.global with no fence
// or volatile qualifier; the spin loops rely on each load re-reading memory
// and on stores becoming visible in program order - confirm on the target.
// NOTE(review): two blocks that map to the same slot at the same time would
// share one ticket - presumably the launch avoids that; confirm.
.entry e011e760c_tester

{

.param .u32 p_entering;

.param .u32 p_ret;

.param .s32 p_n;

.reg .u32	$r1,$r2,$r3,$r4,$r5,$r6,$r7,$r8,$r9,$r10,$r11,$r12,$r13,$r14,$r15,$r16,$r17,$r18,$r19,$r20,$r21,$r22,$r23,$r24,$r25,$r26,$r27,$r28,$r29,$r30,$r31,$r32,$r33,$r34,$r35,$r36,$r37,$r38,$r39,$r40,$r41,$r42,$r43,$r44,$r45,$r46,$r47,$r48,$r49,$r50,$r51,$r52,$r53,$r54,$r55,$r56,$r57,$r58,$r59,$r60,$r61,$r62,$r63,$r64,$r65,$r66,$r67;

.reg .pred	$p1,$p2,$p3,$p4,$p5,$p6,$p7,$p8;

// Only the thread with tid.x == 0 continues; every other thread exits.
cvt.u32.u16 $r1,%tid.x;

mov.s32	$r2,$r1;

$L012454D0:

mov.s32	$r3,0;

setp.eq.s32	$p1,$r2,$r3;

@$p1	bra	$L0123DC18;

bra.uni $L_exit;

$L0123DC18:

// $r7 = p_entering + 64: base address of the ticket-number array.
ld.param.s32	$r5,p_entering;

mov.s32	$r6,64;

add.s32	$r4,$r5,$r6;

mov.s32	$r7,$r4;

// slot = ((physid >> 16) & 1) + (physid >> 19)
// NOTE(review): %physid is undocumented; this assumes bits 16 and up hold the
// multiprocessor ID in the 0x00..0x71 pattern observed in the thread, which
// this expression maps onto 0..15 - confirm per device.
mov.u32 $r8,%physid;

mov.s32	$r9,$r8;

$L0123E050:

mov.s32	$r10,$r9;

mov.s32	$r12,16;

shr.s32	$r11,$r10,$r12;

mov.s32	$r13,1;

and.b32	$r11,$r13,$r11;

mov.s32	$r15,19;

shr.s32	$r14,$r10,$r15;

add.s32	$r11,$r11,$r14;

mov.s32	$r16,$r11;

$L0123D8B8:

// $r17 = slot; it is used for all indexing from here on.
mov.s32	$r17,$r16;

// entering[slot] = 1 : announce that this slot is choosing a ticket.
mov.s32	$r19,2;

shl.b32	$r18,$r17,$r19;

ld.param.s32	$r20,p_entering;

add.s32	$r18,$r20,$r18;

mov.s32	$r22,1;

mov.s32	$r21,$r22;

st.global.s32	[$r18],$r21;

// $r23 = running maximum (starts at 0), $r24 = j (starts at 0).
mov.s32	$r23,0;

mov.s32	$r24,0;

bra.uni	$L0123E638;

// Loop body: $r23 = max($r23, number[j]); j = j + 1.
$L0123E710:

mov.s32	$r26,2;

shl.b32	$r25,$r24,$r26;

add.s32	$r25,$r7,$r25;

ld.global.s32	$r27,[$r25];

mov.s32	$r28,$r23;

max.s32 $r29,$r28,$r27;

mov.s32	$r30,$r29;

$L012423D0:

mov.s32	$r23,$r30;

add.s32	$r24,$r24,1;

// Loop test: repeat while j <= 15.
$L0123E638:

mov.s32	$r31,15;

setp.le.s32	$p2,$r24,$r31;

@$p2	bra	$L0123E710;

// My ticket: $r23 = max + 1 ($r32 keeps the old maximum but is never read).
mov.s32	$r32,$r23;

add.s32	$r23,$r23,1;

// number[slot] = my ticket.
mov.s32	$r34,2;

shl.b32	$r33,$r17,$r34;

add.s32	$r33,$r7,$r33;

mov.s32	$r35,$r23;

st.global.s32	[$r33],$r35;

// entering[slot] = 0 : ticket chosen.
mov.s32	$r37,2;

shl.b32	$r36,$r17,$r37;

ld.param.s32	$r38,p_entering;

add.s32	$r36,$r38,$r36;

mov.s32	$r40,0;

mov.s32	$r39,$r40;

st.global.s32	[$r36],$r39;

// Wait phase: $r41 = j, visiting slots 0..15 in order.
mov.s32	$r41,0;

bra.uni	$L0123FEB0;

$L0123FF88:

bra.uni	$L01240498;

// Spin 1: re-read entering[j] until it is 0.
$L012403C0:

$L01240498:

mov.s32	$r43,2;

shl.b32	$r42,$r41,$r43;

ld.param.s32	$r44,p_entering;

add.s32	$r42,$r44,$r42;

ld.global.s32	$r45,[$r42];

mov.s32	$r46,0;

setp.ne.s32	$p3,$r45,$r46;

@$p3	bra	$L012403C0;

bra.uni	$L01240C30;

// Spin 2: keep waiting while number[j] != 0 and
// (number[j] < my ticket, or number[j] == my ticket and j < slot).
// number[j] is re-loaded before each of the three comparisons.
$L01240B58:

$L01240C30:

mov.s32	$r48,2;

shl.b32	$r47,$r41,$r48;

add.s32	$r47,$r7,$r47;

ld.global.s32	$r49,[$r47];

mov.s32	$r50,0;

// number[j] == 0: slot j is not waiting, move on to the next j.
setp.eq.s32	$p4,$r49,$r50;

@$p4	bra	$L01240D08;

mov.s32	$r52,2;

shl.b32	$r51,$r41,$r52;

add.s32	$r51,$r7,$r51;

ld.global.s32	$r53,[$r51];

// number[j] < my ticket: slot j goes first, keep spinning.
setp.lt.s32	$p5,$r53,$r23;

@$p5	bra	$L01240B58;

mov.s32	$r55,2;

shl.b32	$r54,$r41,$r55;

add.s32	$r54,$r7,$r54;

ld.global.s32	$r56,[$r54];

// number[j] != my ticket (so it is larger): slot j waits for us, next j.
setp.ne.s32	$p6,$r56,$r23;

@$p6	bra	$L01240D08;

// Equal tickets: the lower slot index wins, so spin if j < slot.
setp.lt.s32	$p7,$r41,$r17;

@$p7	bra	$L01240B58;

$L01240D08:

add.s32	$r41,$r41,1;

// Loop test: repeat while j <= 15.
$L0123FEB0:

mov.s32	$r57,15;

setp.le.s32	$p8,$r41,$r57;

@$p8	bra	$L0123FF88;

// Critical section: *p_ret = *p_ret + (ctaid.x + 1).
// This is a plain load-add-store; exclusion comes only from the lock above.
cvt.u32.u16 $r58,%ctaid.x;

mov.s32	$r59,$r58;

$L01245D40:

add.s32	$r59,$r59,1;

ld.param.s32	$r61,p_ret;

ld.param.s32	$r63,p_ret;

ld.global.s32	$r62,[$r63];

add.s32	$r60,$r62,$r59;

st.global.s32	[$r61],$r60;

// Release: number[slot] = 0.
mov.s32	$r65,2;

shl.b32	$r64,$r17,$r65;

add.s32	$r64,$r7,$r64;

mov.s32	$r67,0;

mov.s32	$r66,$r67;

st.global.s32	[$r64],$r66;

$L_exit:exit;

}

Successfully implemented block-wise atomic operation on 8800 using Lamport’s bakery algorithm and %physid.

My guess should be correct.

Ah, cool. Did you run it with fewer blocks than MPs in an iteration? That is, I wonder if the block IDs are “stable” if the code is called iteratively by the kernel (= iteration on the GPU, not on the host).

Peter

I did. It seems to be so, but I’m not quite sure since I can only test trivial kernels. The Lamport@&#% lock doesn’t depend on that though :)