Well, they have said that a new CUDA beta is likely by the end of the month: maybe they are working on that :)
Regarding the side-by-side C and PTX, I’m not sure what problem you are having. Maybe you aren’t specifying -keep or -ptx so that nvcc keeps the generated PTX?
test.cu
// Small demo kernel used to compare the C source against the generated PTX.
// Each thread derives a global index from its block and thread indices,
// computes a float value from that index, and stores it to d_out[idx].
//   d_out: output array; each thread writes exactly one element, at idx.
// NOTE(review): there is no bounds check, so d_out presumably must hold at
// least gridDim.x * blockDim.x floats — confirm against the launch code.
__global__ void kernel(float *d_out)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x; // flat 1D global thread index
float a = idx*0.05f + 4.2f; // starting value depends on the thread index
for (int i = 0; i < idx; i++) // trip count equals idx, so it differs per thread
a += 0.01f;
d_out[idx] = a;
}
nvcc -keep --opencc-options -LIST:source=on test.cu
test.ptx (abbreviated; the C source lines appear as comments because of -LIST:source=on)
// 1 __global__ void kernel(float *d_out)
$LBB1__Z6kernelPf:
.loc 12 5 0
// 2 {
// 3 int idx = blockIdx.x * blockDim.x + threadIdx.x;
// 4
// 5 float a = idx*0.05f + 4.2f;
mov.u16 $rh1, %ctaid.x; //
mov.u16 $rh2, %ntid.x; //
mul.wide.u16 $r1, $rh1, $rh2; //
cvt.u32.u16 $r2, %tid.x; //
add.u32 $r3, $r2, $r1; //
mov.f32 $f1, 0f40866666; // 4.2
cvt.rn.f32.s32 $f2, $r3; //
mov.f32 $f3, 0f3d4ccccd; // 0.05
mad.f32 $f4, $f2, $f3, $f1; //
mov.s32 $r4, 0; //
setp.le.s32 $p1, $r3, $r4; //
@$p1 bra $Lt_0_5; //
mov.s32 $r5, $r3; //
mov.s32 $r6, 0; //
mov.s32 $r7, $r5; //
$Lt_0_7:
//<loop> Loop body line 5, nesting depth: 1, estimated iterations: unknown
.loc 12 7 0
// 6 for (int i = 0; i < idx; i++)
// 7 a += 0.01f;
mov.f32 $f5, 0f3c23d70a; // 0.01
add.f32 $f4, $f4, $f5; //
add.s32 $r6, $r6, 1; //
setp.ne.s32 $p2, $r6, $r3; //
@$p2 bra $Lt_0_7; //
$Lt_0_5:
.loc 12 9 0
// 8
// 9 d_out[idx] = a;
ld.param.u64 $rd1, [__cudaparm__Z6kernelPf_d_out]; // id:25 __cudaparm__Z6kernelPf_d_out+0x0
cvt.u64.s32 $rd2, $r3; //
mul.lo.u64 $rd3, $rd2, 4; //
add.u64 $rd4, $rd1, $rd3; //
st.global.f32 [$rd4+0], $f4; // id:26
exit; //
} // _Z6kernelPf
This is with CUDA 1.1 btw.