ptxas -v: What's the smem after the plus sign? "ptxas info: 3256+3224 bytes smem"

I’m coding an expansive algorithm by hand; a copy of the in-progress PTX source is below for reference. When I compile it using ptxas -v, I get this result:

ptxas info    : Compiling entry function 'iter_kernel'

ptxas info    : Used 9 registers, 3256+3224 bytes smem, 62724 bytes cmem[0], 20 bytes cmem[1]

Assuming linear allocation, my shared memory usage should be (256 * (f32 + f32 + u32) + 32 * (u32) + 8 * (u32)) = 3232 bytes (plus 8 for the params). decuda doesn’t give any extra information about how smem is allocated. Is there any documentation or knowledge that can help me understand what ‘3256+3224’ means in this context?

Thanks for your help!

// Module header: PTX ISA version and target architecture.
.version 1.1
.target  sm_11, map_f64_to_f32

// Constant-space data.  The declared sizes add up to 62724 bytes.
.const .f32 camera[1536];       // 32 * 48
.const .u16 xmeta[1664];        // 32 * 52
.const .u8  colormap_r[8192];   // 32 * 256
.const .u8  colormap_g[8192];   // same extent as colormap_r
.const .u8  colormap_b[8192];   // same extent as colormap_r
.const .b32 xf_stream[7168];    // partitioned entirely on the host side; this code doesn't care
.const .u32 g_rseed;            // iter_kernel loads this and uses it as the base address of its seed pairs

// iter_kernel (work in progress)
//
// Seeds a per-thread multiply-with-carry generator from a {multiplier, seed}
// pair in global memory, fills this thread's slot of pts_x, pts_y and colors
// in shared memory with random values, then writes its pts_x value to a
// debug buffer and exits.
//
// Params:
//   id       index of this block's helper; folded into the global thread index
//   g_debug  base address (global space) of the debug buffer; each thread
//            stores one .f32 at g_debug + 4 * global_thread_index
// Reads:
//   g_rseed  (.const) base address (global space) of the seed pairs; each
//            thread loads 8 bytes at g_rseed + 8 * global_thread_index
//
// i, f2, p, offsets and vote are declared for the unfinished part of the
// kernel and are not referenced by the code below.
.entry iter_kernel
{
	.param .u32 id;			// Index of this block's helper
	.param .u32 g_debug;

	.reg .s32 i;			// always an iterator
	.reg .u32 rmul;			// local random multiplier
	.reg .u64 seed;			// low 32 bits: last-generated digits, high 32 bits: carry
	.reg .u32 t1, t2, t3;		// temporary helpers
	.reg .u32 gtid;			// global thread index, computed once and reused
	.reg .u32 soff;			// this thread's byte offset into the shared arrays
	.reg .f32 f1, f2;		// float temps
	.reg .pred p;

	.shared .f32	pts_x[256];	// 8 * 32
	.shared .f32	pts_y[256];
	.shared .u32	colors[256];	// Can almost certainly be changed to u16
	.shared .u32	offsets[32];	// 1 * 32
	.shared .u32	vote[8];	// 8 * 1

	// Global thread index:
	//   gtid = (((((id * nctaid.x) + ctaid.x) * ntid.y) + tid.y) * ntid.x) + tid.x
	// mad24 only multiplies the low 24 bits of its first two operands.
	ld.param.u32	gtid,	[id];
	cvt.u32.u16	t2,	%nctaid.x;
	cvt.u32.u16	t3,	%ctaid.x;
	mad24.lo.u32	gtid,	gtid,	t2,	t3;
	cvt.u32.u16	t2,	%ntid.y;
	cvt.u32.u16	t3,	%tid.y;
	mad24.lo.u32	gtid,	gtid,	t2,	t3;
	cvt.u32.u16	t2,	%ntid.x;
	cvt.u32.u16	t3,	%tid.x;
	mad24.lo.u32	gtid,	gtid,	t2,	t3;

	// Load seeds.  They are stored as a uint2 in the format {multiplier, seed}.
	ld.const.u32	t3,	[g_rseed];
	mad24.lo.u32	t1,	gtid,	8,	t3;	// sizeof(.v2.u32) == 8
	ld.global.v2.u32	{rmul, t3},	[t1];
	cvt.u64.u32	seed,	t3;

	// Load carry value into the high word.  This is an arbitrary number less
	// than rmul; the one below is the canonical value but really anything
	// nonzero is fine.  It's okay to share.
	add.u64	seed,	seed,	1234567 << 32;

	// Each thread owns one slot of pts_x, pts_y and colors, selected by its
	// index within the block.  The row width is ntid.x, so that every
	// (tid.x, tid.y) pair maps to a distinct slot.
	cvt.u32.u16	t1,	%tid.y;
	cvt.u32.u16	t2,	%ntid.x;
	cvt.u32.u16	t3,	%tid.x;
	mad24.lo.u32	soff,	t1,	t2,	t3;
	// All three arrays hold 4-byte elements.  The byte offset is formed
	// explicitly so the accesses below do not depend on how the assembler
	// treats a register inside array-index brackets.
	shl.b32	soff,	soff,	2;

	// Generate a new random int in t2: emit the low word, then
	// seed = rmul * low + carry.  You'll see these three lines a lot.
	cvt.u32.u64	t2,	seed;
	shr.u64	seed,	seed,	32;
	mad.wide.u32	seed,	rmul,	t2,	seed;

	// Convert to float in range [-1.0, 1.0).  Also popular.  The constant is
	// (1 / (2 ^ 28)); after the shift t2 is below 2 ^ 29.
	shr.u32	t2,	t2,	3;
	cvt.rn.f32.s32	f1,	t2;
	mad.f32	f1,	f1,	.0000000037252902984619140625,	-1.0;
	mov.u32	t3,	pts_x;
	add.u32	t3,	t3,	soff;
	st.shared.f32	[t3],	f1;

	// Same again for y (unrolled by hand).
	cvt.u32.u64	t2,	seed;
	shr.u64	seed,	seed,	32;
	mad.wide.u32	seed,	rmul,	t2,	seed;
	shr.u32	t2,	t2,	3;
	cvt.rn.f32.s32	f1,	t2;
	mad.f32	f1,	f1,	.0000000037252902984619140625,	-1.0;
	mov.u32	t3,	pts_y;
	add.u32	t3,	t3,	soff;
	st.shared.f32	[t3],	f1;

	// The color stays in int format.
	cvt.u32.u64	t2,	seed;
	shr.u64	seed,	seed,	32;
	mad.wide.u32	seed,	rmul,	t2,	seed;
	mov.u32	t3,	colors;
	add.u32	t3,	t3,	soff;
	st.shared.u32	[t3],	t2;

ILOOP_START:

	// Pick a transform ... obviously from here to the end much code is missing

	// Debug output: store this thread's pts_x value at
	// g_debug + 4 * gtid (one .f32 per thread).
	mov.u32	t3,	pts_x;
	add.u32	t3,	t3,	soff;
	ld.shared.f32	f1,	[t3];
	ld.param.u32	t3,	[g_debug];
	mad24.lo.u32	t1,	gtid,	4,	t3;
	st.global.f32	[t1],	f1;

	exit;
}

// vim: set ts=6 sts=0 noexpandtab :

I ran into the same problem: I can't see what the second number before “smem” stands for.

Thx for the explanation

It was answered here: http://forums.nvidia.com/index.php?showtop…st&p=500344