I was thinking of a tool with which I could get an approximate view of the CUDA C++ code corresponding to a given piece of PTX code.
Something similar is shown in the following blog post by Fei Kong, which discusses reverse engineering PTX back to CUDA.
Given a PTX:
//
// Generated by LLVM NVPTX Back-End
//
// PTX ISA 8.4, targeting sm_75 (Turing), 64-bit address space.
.version 8.4
.target sm_75
.address_size 64
// .globl kernel
// Entry point: kernel(ptr param_0, ptr param_1, u32 param_2).
// .maxntid pins the launch to at most 128 threads per block in x.
.visible .entry kernel(
.param .u64 kernel_param_0,
.param .u64 kernel_param_1,
.param .u32 kernel_param_2
)
.maxntid 128, 1, 1
{
// Virtual register declarations: predicates, 32-bit, 64-bit.
.reg .pred %p<3>;
.reg .b32 %r<9>;
.reg .b64 %rd<6>;
.loc 1 9 0
$L__func_begin0:
.loc 1 9 0
// %rd3 = param_0 (source pointer), %rd4 = param_1 (destination pointer).
ld.param.u64 %rd3, [kernel_param_0];
ld.param.u64 %rd4, [kernel_param_1];
$L__tmp0:
.loc 1 16 24
// begin inline asm
mov.u32 %r1, %ctaid.x;              // %r1 = blockIdx.x
// end inline asm
.loc 1 17 20
shl.b32 %r4, %r1, 7;                // %r4 = blockIdx.x << 7 (x * 128)
ld.param.u32 %r5, [kernel_param_2]; // %r5 = param_2 (element count bound)
.loc 1 17 48
mov.u32 %r6, %tid.x;                // %r6 = threadIdx.x
and.b32 %r7, %r6, 127;              // %r7 = threadIdx.x & 127
.loc 1 17 35
// OR acts as ADD here: the low 7 bits of %r4 are zero after the shift.
or.b32 %r8, %r4, %r7;               // %r8 = global element index
.loc 1 18 21
// Signed bounds check: %p1 = (index < param_2).
setp.lt.s32 %p1, %r8, %r5;
.loc 1 19 24
// Byte offset for 4-byte elements, with signed widening to 64 bits.
mul.wide.s32 %rd5, %r8, 4;
add.s64 %rd1, %rd3, %rd5;           // %rd1 = param_0 + index*4
.loc 1 19 16
// begin inline asm
mov.u32 %r3, 0x0;                   // default value when the load is masked off
@%p1 ld.global.b32 { %r3 }, [ %rd1 + 0 ];   // predicated 32-bit load
// end inline asm
.loc 1 20 21
add.s64 %rd2, %rd4, %rd5;           // %rd2 = param_1 + index*4
.loc 1 20 30
// begin inline asm
@%p1 st.global.b32 [ %rd2 + 0 ], { %r3 };   // predicated 32-bit store
// end inline asm
.loc 1 20 4
ret;
$L__tmp1:
$L__func_end0:
}
.file 1 "/tmp/ipykernel_2092734/781878086.py"
// DWARF abbreviation table: a single compile-unit abbreviation
// (tag 17 = DW_TAG_compile_unit) with producer/name/line/comp_dir/
// low_pc/high_pc attributes.
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 0
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 0
}
// DWARF compile-unit info. The raw .b8 byte runs below are ASCII strings:
// producer = "triton", source name = "781878086.py",
// comp_dir = "/tmp/ipykernel_2092734".
.section .debug_info
{
.b32 73
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116        // 't'
.b8 114        // 'r'
.b8 105        // 'i'
.b8 116        // 't'
.b8 111        // 'o'
.b8 110        // 'n'  -> producer "triton"
.b8 0
.b8 2
.b8 0
.b8 55         // '7'
.b8 56         // '8'
.b8 49         // '1'
.b8 56         // '8'
.b8 55         // '7'
.b8 56         // '8'
.b8 48         // '0'
.b8 56         // '8'
.b8 54         // '6'
.b8 46         // '.'
.b8 112        // 'p'
.b8 121        // 'y'  -> name "781878086.py"
.b8 0
.b32 .debug_line
.b8 47         // '/'
.b8 116        // 't'
.b8 109        // 'm'
.b8 112        // 'p'
.b8 47         // '/'
.b8 105        // 'i'
.b8 112        // 'p'
.b8 121        // 'y'
.b8 107        // 'k'
.b8 101        // 'e'
.b8 114        // 'r'
.b8 110        // 'n'
.b8 101        // 'e'
.b8 108        // 'l'
.b8 95         // '_'
.b8 50         // '2'
.b8 48         // '0'
.b8 57         // '9'
.b8 50         // '2'
.b8 55         // '7'
.b8 51         // '3'
.b8 52         // '4'  -> comp_dir "/tmp/ipykernel_2092734"
.b8 0
.b64 $L__func_begin0
.b64 $L__func_end0
}
.section .debug_loc { }
I would like to obtain a rough CUDA C++ reconstruction like the following (for understanding):
// Approximate CUDA C++ reconstruction of the PTX above.
// Copies input[i] -> output[i] for indices below N, with each block
// covering a contiguous chunk of 128 elements (matches .maxntid 128, 1, 1).
__global__ void kernel(float *input, float *output, int N) {
    // Block and thread indices (PTX: %ctaid.x and %tid.x).
    int blockId = blockIdx.x;
    int threadId = threadIdx.x;

    // Global element index: blockIdx.x * 128 + (threadIdx.x & 127).
    // The PTX builds this as (blockId << 7) | (threadId & 127); the OR is
    // equivalent to addition because the low 7 bits of the shifted value
    // are zero.
    int globalIndex = (blockId << 7) | (threadId & 127);

    // Bounds check. The index is kept SIGNED to match the PTX, which uses
    // setp.lt.s32 (signed compare) and mul.wide.s32 (signed widening
    // multiply). With an `unsigned int` index, `globalIndex < N` would
    // implicitly convert N to unsigned and diverge from the PTX for N < 0.
    if (globalIndex < N) {
        // Predicated ld.global.b32 / st.global.b32 pair in the PTX.
        float value = input[globalIndex];
        output[globalIndex] = value;
    }
}
The above reverse engineering was done by ChatGPT.