Here is my minimal example: (doesn’t work)
// tensor_mma_kernel.ptx
.version 8.4
.target sm_89
.address_size 64
// Entry point for the kernel
.entry tensor_mma_kernel(
.param.u64.ptr.global.align 16 A_ptr, // Pointer to matrix A in global memory
.param.u64.ptr.global.align 16 B_ptr, // Pointer to matrix B in global memory
.param.u64.ptr.global.align 16 C_ptr // Pointer to matrix C in global memory
)
{
// Declare registers for pointers
.reg .u64 %ra, %rb, %rc; // Base pointers to A, B, C
.reg .u64 %rb0, %rb1; // Addresses for elements in B
.reg .f16 %Ra0, %Ra1, %Ra2, %Ra3; // f16 data for A
.reg .f16 %Rb0, %Rb1; // f16 data for B
.reg .f32 %Rc0, %Rc1, %Rc2, %Rc3; // Accumulators for C
.reg .f32 %Rd0, %Rd1, %Rd2, %Rd3; // Results for D
// Load pointers from parameters
ld.param.u64 %ra, [A_ptr];
ld.param.u64 %rb, [B_ptr];
ld.param.u64 %rc, [C_ptr];
// Initialize accumulators for matrix C to 0.0f
mov.f32 %Rc0, 0.0;
mov.f32 %Rc1, 0.0;
mov.f32 %Rc2, 0.0;
mov.f32 %Rc3, 0.0;
// Load raw A data from global memory as u16 and convert to f16
ld.global.b16 %Ra0, [%ra];
ld.global.b16 %Ra1, [%ra + 2];
ld.global.b16 %Ra2, [%ra + 4];
ld.global.b16 %Ra3, [%ra + 6];
// Load raw B data from global memory as u16 and convert to f16
ld.global.b16 %Rb0, [%rb];
ld.global.b16 %Rb1, [%rb + 2];
// MMA operation: D = A * B + C
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 // this is line 42
{%Rd0, %Rd1, %Rd2, %Rd3},
{%Ra0, %Ra1, %Ra2, %Ra3},
{%Rb0, %Rb1},
{%Rc0, %Rc1, %Rc2, %Rc3};
// Store the result back to global memory
st.global.f32 [%rc + 0], %Rd0;
st.global.f32 [%rc + 4], %Rd1;
st.global.f32 [%rc + 8], %Rd2;
st.global.f32 [%rc + 12], %Rd3;
// Exit the kernel
exit;
}
when I run:
ptxas -arch=sm_89 tensor_mma_kernel.ptx -o tensor_mma_kernel.cubin
I get:
ptxas tensor_mma_kernel.ptx, line 42; error : Arguments mismatch for instruction ‘mma’
ptxas fatal : Ptx assembly aborted due to errors
What am I doing wrong?