I have searched on Google but have not found an answer yet. Maybe I can just declare the data as float and then use
mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32
Will it automatically read the data as tf32, even if my input is float or int?
Thank you!!!
Is there anything like this:
tf32 dd[4] = {0., 0., 0., 0.};
(I tried it, but it produced an error…) Or maybe I should declare the data as float and convert it to tf32 every time, e.g. with __float_to_tf32?
Considering the example below, what should I write?
$ cat t10.cu
#include <mma.h>
#include <cuda_fp16.h>
#include <iostream>
#include <stdio.h>
// Rounds an FP32 value to TF32 (round-to-nearest, ties away from zero) and
// returns the 32-bit register image expected by the tf32 mma operands.
// There is no C++ "tf32" storage type: values live in ordinary 32-bit
// registers, so they are declared as float and converted like this.
// Requires sm_80 or newer.
static __device__ __forceinline__ unsigned float_to_tf32_bits(float x) {
  unsigned bits;
  asm("cvt.rna.tf32.f32 %0, %1;" : "=r"(bits) : "f"(x));
  return bits;
}

// One warp-wide TF32 tensor-core multiply-accumulate issued through inline PTX:
//   D(16x8, f32) = A(16x4, tf32, row-major) * B(4x8, tf32, col-major) + C(16x8, f32)
//
// Requirements:
//   - compile for sm_80 or newer (the .tf32 mma variants do not exist earlier),
//   - launch with exactly one full warp (<<<1, 32>>>): mma.sync must be
//     executed by all 32 lanes, and the output indexing assumes threadIdx.x < 32,
//   - out points to at least 16*8 floats; D is written there row-major.
//
// The kernel name is kept for existing callers; the operands are TF32, not FP16.
//
// Test pattern: A is all ones except column 0, which holds the row index
// (0..15); B is all ones except row 1, which holds 3; C is zero.
// Hence D[r][c] = r*1 + 1*3 + 1*1 + 1*1 = r + 5 for every column c.
__global__ void mma_fp16_acc_fp32(float *out) {
  // Fragment coordinates as defined by the PTX ISA for m16n8k4 .tf32.
  const unsigned group = threadIdx.x / 4;  // groupID: row of A/C/D, column of B
  const unsigned tig = threadIdx.x % 4;    // threadID_in_group: k index of A and B

  // A fragment: a[0] is A[group][tig], a[1] is A[group + 8][tig].
  float a[2] = {1.f, 1.f};
  // B fragment: b is B[tig][group].
  float b = 1.f;
  if (tig == 0) {
    // First column of A becomes 0, 1, 2, ..., 15.
    a[0] = static_cast<float>(group);
    a[1] = static_cast<float>(group + 8);
  }
  if (tig == 1) {
    // Second row of B becomes 3, 3, ..., 3.
    b = 3.f;
  }

  // Convert the inputs to TF32 register images (exact for these small integers).
  const unsigned a0 = float_to_tf32_bits(a[0]);
  const unsigned a1 = float_to_tf32_bits(a[1]);
  const unsigned b0 = float_to_tf32_bits(b);

  const float c[4] = {0.f, 0.f, 0.f, 0.f};
  float d[4] = {0.f, 0.f, 0.f, 0.f};

  asm volatile(
      "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 "
      "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n"
      : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3])
      : "r"(a0), "r"(a1),
        "r"(b0),
        "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));

  // D fragment: d[0], d[1] are D[group][2*tig + {0,1}] and d[2], d[3] are
  // D[group + 8][2*tig + {0,1}]. With 8 columns per row,
  // group*8 + 2*tig == 2*threadIdx.x, and the lower half starts at 8*8.
  out[threadIdx.x * 2 + 0] = d[0];
  out[threadIdx.x * 2 + 1] = d[1];
  out[8 * 8 + threadIdx.x * 2 + 0] = d[2];
  out[8 * 8 + threadIdx.x * 2 + 1] = d[3];
}
// Host driver: allocates a 16x8 float result buffer, runs the MMA kernel on a
// single warp, copies the result back and prints it row by row.
// Returns 0 on success and 1 if an allocation or any CUDA call fails.
int main() {
  const int rows = 16;
  const int cols = 8;
  const size_t bytes = rows * cols * sizeof(float);

  // Reports a failed CUDA call on stderr; returns true when the call succeeded.
  auto ok = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
      return false;
    }
    return true;
  };

  float *h_C = (float *)malloc(bytes);
  if (h_C == nullptr) {
    fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
    return 1;
  }

  float *d_C = nullptr;
  bool good = ok(cudaMalloc(&d_C, bytes), "cudaMalloc");
  if (good) {
    // The kernel needs exactly one full warp.
    mma_fp16_acc_fp32<<<1, 32>>>(d_C);
    // A launch does not return an error itself: launch failures show up in
    // cudaGetLastError(), execution faults at the next synchronizing call.
    good = ok(cudaGetLastError(), "kernel launch") &&
           ok(cudaDeviceSynchronize(), "kernel execution") &&
           ok(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy");
  }

  if (good) {
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) std::cout << h_C[i * cols + j] << " ";
      std::cout << std::endl;
    }
  }

  // Release both buffers on every path; cudaFree(nullptr) is a no-op.
  cudaFree(d_C);
  free(h_C);
  return good ? 0 : 1;
}
$ nvcc -o t10 t10.cu -arch=sm_80
$ cuda-memcheck ./t10
========= CUDA-MEMCHECK
9 9 9 9 9 9 9 9
10 10 10 10 10 10 10 10
11 11 11 11 11 11 11 11
12 12 12 12 12 12 12 12
13 13 13 13 13 13 13 13
14 14 14 14 14 14 14 14
15 15 15 15 15 15 15 15
16 16 16 16 16 16 16 16
17 17 17 17 17 17 17 17
18 18 18 18 18 18 18 18
19 19 19 19 19 19 19 19
20 20 20 20 20 20 20 20
21 21 21 21 21 21 21 21
22 22 22 22 22 22 22 22
23 23 23 23 23 23 23 23
24 24 24 24 24 24 24 24
========= ERROR SUMMARY: 0 errors
$