I’m coding an expansive algorithm by hand; a copy of the in-progress PTX source is below for reference. When I compile it using ptxas -v, I get this result:
ptxas info : Compiling entry function 'iter_kernel'
ptxas info : Used 9 registers, 3256+3224 bytes smem, 62724 bytes cmem[0], 20 bytes cmem[1]
Assuming linear allocation, my shared memory usage should be (256 * (f32 + f32 + u32) + 32 * (u32) + 8 * (u32)) = 3232 bytes (plus 8 for the params). decuda doesn’t give any extra information about how smem is allocated. Is there any documentation or knowledge that can help me understand what ‘3256+3224’ means in this context?
Thanks for your help!
.version 1.1
.target sm_11, map_f64_to_f32
// Constant-memory tables. Together they occupy
// 6144 + 3328 + 3 * 8192 + 28672 + 4 = 62724 bytes.
.const .f32 camera[1536]; // 32 * 48 floats (6144 bytes)
.const .u16 xmeta[1664]; // 32 * 52 halfwords (3328 bytes)
.const .u8 colormap_r[8192]; // 32 * 256 bytes per colour channel
.const .u8 colormap_g[8192];
.const .u8 colormap_b[8192];
.const .b32 xf_stream[7168]; // the partitioning here is all host-side, this code doesn't care
// 32-bit global-memory address of the RNG seed table. iter_kernel reads it with
// ld.const and uses it as the base of an 8-byte-per-thread ld.global.v2.u32.
.const .u32 g_rseed;
// iter_kernel
//
// Seeds a per-thread multiply-with-carry RNG from a table in global memory,
// fills this thread's slot of pts_x / pts_y / colors in shared memory with
// random values, and (for now) writes the pts_x value back out through
// g_debug. Everything between ILOOP_START and the debug store is still to be
// written.
//
// Params:
//   id      - index of this block's helper; outermost term of the global
//             thread index used to address the seed and debug arrays
//   g_debug - global-memory address of an f32 array, one slot per thread
//
// Requirements visible from the code:
//   * ntid.x * ntid.y <= 256, the element count of the shared arrays
//   * every mad24 operand must fit in 24 bits
.entry iter_kernel
{
	.param .u32 id;  // Index of this block's helper
	.param .u32 g_debug;  // Global address of the f32 debug buffer

	.reg .s32 i;  // always an iterator
	.reg .u32 rmul;  // local random multiplier
	.reg .u64 seed;  // and last-generated digits
	.reg .u32 t1, t2, t3;  // temporary helpers
	.reg .f32 f1, f2;  // float temps
	.reg .pred p;

	.shared .f32 pts_x[256];  // 8 * 32
	.shared .f32 pts_y[256];
	.shared .u32 colors[256];  // Can almost certainly be changed to u16
	.shared .u32 offsets[32];  // 1 * 32
	.shared .u32 vote[8];  // 8 * 1

	// Load seeds. The global thread index is
	//   (((id * nctaid.x + ctaid.x) * ntid.y + tid.y) * ntid.x) + tid.x
	// and each seed record is a uint2 in the format {multiplier, seed}.
	ld.param.u32 t1, [id];
	cvt.u32.u16 t2, %nctaid.x;
	cvt.u32.u16 t3, %ctaid.x;
	mad24.lo.u32 t1, t1, t2, t3;  // id * nctaid.x + ctaid.x
	cvt.u32.u16 t2, %ntid.y;
	cvt.u32.u16 t3, %tid.y;
	mad24.lo.u32 t1, t1, t2, t3;  // ... * ntid.y + tid.y
	cvt.u32.u16 t2, %ntid.x;
	cvt.u32.u16 t3, %tid.x;
	mad24.lo.u32 t1, t1, t2, t3;  // ... * ntid.x + tid.x
	ld.const.u32 t3, [g_rseed];
	mad24.lo.u32 t1, t1, 8, t3;  // sizeof(.v2.u32) == 8
	ld.global.v2.u32 {rmul, t3}, [t1];
	cvt.u64.u32 seed, t3;

	// Load carry value into the high word of seed. This is an arbitrary number
	// less than rmul; the one below is the canonical value but really anything
	// nonzero is fine. It's okay to share.
	add.u64 seed, seed, 1234567 << 32;

	// Each thread owns one slot of pts_x, pts_y and colors, selected by its
	// linear index inside the block: tid.y * ntid.x + tid.x. The row stride is
	// ntid.x (not ntid.y), otherwise threads of a non-square block collide.
	cvt.u32.u16 t1, %tid.y;
	cvt.u32.u16 t2, %ntid.x;
	cvt.u32.u16 t3, %tid.x;
	mad24.lo.u32 t1, t1, t2, t3;
	// All three arrays have 4-byte elements, so keep the slot as a byte offset
	// and form explicit addresses. That way the access does not depend on how
	// the assembler interprets a register inside array[reg] notation.
	shl.b32 t1, t1, 2;

	// Generate a new random int in t2: emit the low word of seed, then
	// seed = rmul * low + high. You'll see these three lines a lot.
	cvt.u32.u64 t2, seed;
	shr.u64 seed, seed, 32;
	mad.wide.u32 seed, rmul, t2, seed;
	// Convert to float in range [-1.0, 1.0): keep 29 bits, scale by
	// 1 / (2 ^ 28), subtract 1. Also popular. (The 29-bit value is rounded to
	// f32, so the very top of the range can land on 1.0 itself.)
	shr.u32 t2, t2, 3;
	cvt.rn.f32.s32 f1, t2;
	mad.f32 f1, f1, .0000000037252902984619140625, -1.0;
	mov.u32 t3, pts_x;
	add.u32 t3, t3, t1;
	st.shared.f32 [t3], f1;

	// Loop unrolling. Copy and paste never sounded so fancy.
	cvt.u32.u64 t2, seed;
	shr.u64 seed, seed, 32;
	mad.wide.u32 seed, rmul, t2, seed;
	shr.u32 t2, t2, 3;
	cvt.rn.f32.s32 f1, t2;
	mad.f32 f1, f1, .0000000037252902984619140625, -1.0;
	mov.u32 t3, pts_y;
	add.u32 t3, t3, t1;
	st.shared.f32 [t3], f1;

	// We leave this one in int format
	cvt.u32.u64 t2, seed;
	shr.u64 seed, seed, 32;
	mad.wide.u32 seed, rmul, t2, seed;
	mov.u32 t3, colors;
	add.u32 t3, t3, t1;
	st.shared.u32 [t3], t2;

ILOOP_START:
	// Pick a transform ... obviously from here to the end much code is missing

	// Fetch this thread's pts_x value while t1 still holds its byte offset.
	mov.u32 t3, pts_x;
	add.u32 t3, t3, t1;
	ld.shared.f32 f1, [t3];

	// Another of these ugly address constructs gets us into the debug
	// pointers: same global thread index as the seed load, 4 bytes per slot.
	ld.param.u32 t1, [id];
	cvt.u32.u16 t2, %nctaid.x;
	cvt.u32.u16 t3, %ctaid.x;
	mad24.lo.u32 t1, t1, t2, t3;
	cvt.u32.u16 t2, %ntid.y;
	cvt.u32.u16 t3, %tid.y;
	mad24.lo.u32 t1, t1, t2, t3;
	cvt.u32.u16 t2, %ntid.x;
	cvt.u32.u16 t3, %tid.x;
	mad24.lo.u32 t1, t1, t2, t3;
	ld.param.u32 t3, [g_debug];
	mad24.lo.u32 t1, t1, 4, t3;  // sizeof(.f32) == 4
	st.global.f32 [t1], f1;
	exit;
}
// vim: set ts=6 sts=0 noexpandtab :