#include <mma.h>
#include <iostream>
#include <stdio.h>
__global__ void mma_tf32_acc_fp32(float *out) {
float cc[4] = {0., 1., 2., 3.};
float dd[4] = {0., 0., 0., 0.};
float aa[2] = {0., 0.};
float bb = 1.;
asm volatile(
"mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
"{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(dd[0]), "=f"(dd[1]), "=f"(dd[2]), "=f"(dd[3])
:
"f"(aa[0]), "f"(aa[1]),
"f"(bb),
"f"(cc[0]), "f"(cc[1]), "f"(cc[2]), "f"(cc[3])
);
}
int main() {
float* h_C = (float*)malloc(16*8*sizeof(float));
float* d_C;
cudaMalloc(&d_C, 16*8*sizeof(float));
mma_tf32_acc_fp32<<<1, 32>>>(d_C);
cudaDeviceSynchronize();
cudaMemcpy(h_C, d_C, 16*8*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 16; i++){
for (int j = 0; j < 8; j++) std::cout << h_C[i*8+j] << " ";
std::cout << std::endl;}
}
This is my code, and the compile command is:
nvcc delete.cu -o delete -arch=sm_86 -std=c++17
why it has error:
ptxas C:/Users/hzy/AppData/Local/Temp/tmpxft_00006d78_00000000-10_delete5.ptx, line 27; error : Arguments mismatch for instruction 'mma'
ptxas C:/Users/hzy/AppData/Local/Temp/tmpxft_00006d78_00000000-10_delete5.ptx, line 27; error : Arguments mismatch for instruction 'mma'
ptxas fatal : Ptx assembly aborted due to errors