Arguments mismatch for instruction 'mma', why?

#include <mma.h>
#include <iostream>
#include <stdio.h>
__global__ void mma_tf32_acc_fp32(float *out) {
    float cc[4] = {0., 1., 2., 3.}; 
    float dd[4] = {0., 0., 0., 0.};
    float aa[2] = {0., 0.};
    float bb = 1.;
    asm volatile(
      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
      : "=f"(dd[0]), "=f"(dd[1]), "=f"(dd[2]), "=f"(dd[3])
      :
        "f"(aa[0]), "f"(aa[1]),
        "f"(bb),
        "f"(cc[0]), "f"(cc[1]), "f"(cc[2]), "f"(cc[3])
    );
}
int main() {
    float* h_C = (float*)malloc(16*8*sizeof(float));
    float* d_C;
    cudaMalloc(&d_C, 16*8*sizeof(float));
    mma_tf32_acc_fp32<<<1, 32>>>(d_C);
    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, 16*8*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 8; j++) std::cout << h_C[i*8+j] << " ";
      std::cout << std::endl;}
}

This is my code, and the compile command is:

nvcc delete.cu -o delete -arch=sm_86 -std=c++17

Why does it give this error:

ptxas C:/Users/hzy/AppData/Local/Temp/tmpxft_00006d78_00000000-10_delete5.ptx, line 27; error   : Arguments mismatch for instruction 'mma'
ptxas C:/Users/hzy/AppData/Local/Temp/tmpxft_00006d78_00000000-10_delete5.ptx, line 27; error   : Arguments mismatch for instruction 'mma'
ptxas fatal   : Ptx assembly aborted due to errors

The “f” constraint means the operand is passed in an .f32 register, but the tf32 operands of this instruction must be passed in .b32 registers. You need to convert from fp32 to tf32 beforehand.

For example, the following compiles without the ptxas error. You may want to use a different tf32 conversion; I just picked one.

__global__ void mma_tf32_acc_fp32(float *out) {
    float cc[4] = {0., 1., 2., 3.}; 
    float dd[4] = {0., 0., 0., 0.};
    float aa[2] = {0., 0.};
    float bb = 1.;
    asm volatile(
      // declare .b32 scratch registers to hold the tf32-converted A and B operands
      ".reg .b32 %Ra<2>, %Rb<1>;\n\t"
      // convert the f32 inputs (operands %8, %9, %10 below) to tf32 bit patterns
      "cvt.rna.tf32.f32 %Ra0, %8; \n\t"
      "cvt.rna.tf32.f32 %Ra1, %9; \n\t"
      "cvt.rna.tf32.f32 %Rb0, %10; \n\t"
      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
      "{%0,%1,%2,%3}, {%Ra0,%Ra1}, {%Rb0}, {%4,%5,%6,%7};\n"
      : "=f"(dd[0]), "=f"(dd[1]), "=f"(dd[2]), "=f"(dd[3])
      :
        "f"(cc[0]), "f"(cc[1]), "f"(cc[2]), "f"(cc[3]),
        "f"(aa[0]), "f"(aa[1]), "f"(bb)
    );
}
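
Note that, as posted, the kernel never writes dd back to out, so the 16x8 matrix printed by main is never filled in by the mma. If you want to check the result, here is a minimal sketch of the store; it assumes the m16n8 .f32 accumulator fragment layout described in the PTX ISA, and the helper name store_d_fragment is just something I picked:

__device__ void store_d_fragment(float *out, const float (&dd)[4]) {
    // Illustrative helper, not part of the original post.
    // Assumed accumulator layout for m16n8 .f32 fragments (per the PTX ISA):
    // each thread owns two adjacent columns in row (laneid/4) and row (laneid/4 + 8).
    int lane  = threadIdx.x % 32;
    int group = lane / 4;          // 0..7
    int col   = (lane % 4) * 2;    // 0, 2, 4, 6
    out[ group      * 8 + col    ] = dd[0];
    out[ group      * 8 + col + 1] = dd[1];
    out[(group + 8) * 8 + col    ] = dd[2];
    out[(group + 8) * 8 + col + 1] = dd[3];
}

Calling store_d_fragment(out, dd); as the last statement of the kernel should make the printed matrix show the accumulator contents.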

Thank you!! Your solution works! But I also tried the version below, which does not convert the inputs to tf32 format and only uses tf32 in the instruction itself, and it still works… why?

#include <mma.h>
#include <iostream>
#include <cstdint>
#include <stdio.h>
__global__ void mma_tf32_acc_fp32(float *out) {
    float cc[4] = {0., 1., 2., 3.}; 
    float dd[4] = {0., 0., 0., 0.};
    float aa[2] = {0., 0.};
    float bb = 1.;

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&aa);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&bb);

    asm volatile(
      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
      : "=f"(dd[0]), "=f"(dd[1]), "=f"(dd[2]), "=f"(dd[3])
      :
        "r"(A[0]), "r"(A[1]),
        "r"(B[0]),
        "f"(cc[0]), "f"(cc[1]), "f"(cc[2]), "f"(cc[3])
    );
}
int main() {
    float* h_C = (float*)malloc(16*8*sizeof(float));
    float* d_C;
    cudaMalloc(&d_C, 16*8*sizeof(float));
    mma_tf32_acc_fp32<<<1, 32>>>(d_C);
    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, 16*8*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 16; i++){
      for (int j = 0; j < 8; j++) std::cout << h_C[i*8+j] << " ";
      std::cout << std::endl;}
}

I don’t know. But if tf32 and float had different binary representations, a simple bitwise reinterpretation would produce wrong results.

TF32 and float should have the same storage format (the same definition of what each bit means and where it is located). If you only intend to deal with TF32 data, it should be sufficient to store it as float and then “cast” it (no actual conversion should be needed).

However, this does not mean that converting an actual float quantity to a TF32 quantity can be done simply via reinterpretation, as that would not take rounding into account.
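
To see the rounding point concretely, here is a small sketch you could drop into the file above; the names float_to_tf32_bits and show_rounding are mine, and it assumes the same cvt.rna.tf32.f32 instruction used in the earlier answer:

__device__ uint32_t float_to_tf32_bits(float f) {
    // Illustrative helper, not part of the original post.
    // Proper conversion: rounds to the 10 mantissa bits that tf32 keeps.
    uint32_t r;
    asm("cvt.rna.tf32.f32 %0, %1;" : "=r"(r) : "f"(f));
    return r;
}

__global__ void show_rounding() {
    float x = 1.0f + 1.0f / 4096.0f;  // has mantissa bits below the tf32 precision
    printf("reinterpreted: 0x%08x\n", __float_as_uint(x));    // low bits still present
    printf("tf32 rounded : 0x%08x\n", float_to_tf32_bits(x)); // low bits rounded away
}

Launched with show_rounding<<<1, 1>>>(); the two printed bit patterns differ, which is exactly the rounding step that a plain reinterpret_cast skips.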


Thank you very much for your kind suggestion!!!

Well… I am actually trying to do uint32_t computations on the tensor cores now… As far as I know, there is no uint32_t support in the tensor cores… so I am considering using two tf32 values, or fp64, to somehow represent a uint32_t…

What do you think?

Well, I tested it: my version that reinterprets the float bits as uint32_t and feeds them to the tf32 mma leads to incorrect results. Your solution is correct! Thanks!
