Hello, I am running a microbenchmark to see how much utilization can be achieved when CUDA cores and Tensor cores are used simultaneously within a single GPU kernel. As a first step, I am trying to determine the maximum achievable utilization of Tensor core operations. (The device I am using is an RTX 3090.)
#include <cuda_fp16.h>
#include <cstdint>

#define TENSOR_ITER 8192
#define SHARED_MEM_SIZE 12

// NOTE: const_A was not part of the original snippet; it is assumed here to be
// a small __constant__ array that only seeds the operand tiles.
__constant__ half const_A[1];

extern "C" __global__ void mma_fp16_acc_fp32(float *out) {
    uint32_t tid = threadIdx.x;

    // FP16 shared-memory operand tiles. The loop below reads A[0..11]
    // (12 x .b32 = 24 halves) and B[0..5] (6 x .b32 = 12 halves), so shared_A
    // must hold SHARED_MEM_SIZE * 2 halves.
    __shared__ half shared_A[SHARED_MEM_SIZE * 2];
    __shared__ half shared_B[SHARED_MEM_SIZE];

    // Per-thread accumulator fragments: three m16n8k16 MMAs x 4 floats each.
    float frag_d[12];

    // Seed the shared tiles from constant memory.
    if (tid < SHARED_MEM_SIZE * 2) {
        shared_A[tid] = const_A[0];
    }
    if (tid < SHARED_MEM_SIZE) {
        shared_B[tid] = const_A[0];
    }
    // frag_d is private to each thread, so every thread clears all 12 elements.
    for (int j = 0; j < 12; ++j) {
        frag_d[j] = 0.0f;
    }

    // Synchronize all threads within the thread block.
    __syncthreads();

    uint32_t const *A = reinterpret_cast<uint32_t const *>(&shared_A[0]);
    uint32_t const *B = reinterpret_cast<uint32_t const *>(&shared_B[0]);
    float *C = reinterpret_cast<float *>(&frag_d[0]);
    float *D = C;

    // Each m16n8k16 f32.f16.f16.f32 MMA uses 4 x .b32 A registers,
    // 2 x .b32 B registers, and 4 x f32 C/D registers per thread; three
    // back-to-back MMAs are issued per iteration to keep the tensor pipe busy.
    for (unsigned int i = 0; i < TENSOR_ITER; ++i) {
        asm volatile(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
            "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
            : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
            : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
              "r"(B[0]), "r"(B[1]),
              "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])
        );
        asm volatile(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
            "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
            : "=f"(D[4]), "=f"(D[5]), "=f"(D[6]), "=f"(D[7])
            : "r"(A[4]), "r"(A[5]), "r"(A[6]), "r"(A[7]),
              "r"(B[2]), "r"(B[3]),
              "f"(C[4]), "f"(C[5]), "f"(C[6]), "f"(C[7])
        );
        asm volatile(
            "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
            "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
            : "=f"(D[8]), "=f"(D[9]), "=f"(D[10]), "=f"(D[11])
            : "r"(A[8]), "r"(A[9]), "r"(A[10]), "r"(A[11]),
              "r"(B[4]), "r"(B[5]),
              "f"(C[8]), "f"(C[9]), "f"(C[10]), "f"(C[11])
        );
        __syncwarp();
    }

    // Write one accumulator element per block so the work is not optimized away.
    if (tid == 0) {
        out[blockIdx.x] = D[0];
    }
}
As shown in the code above, to maximize Tensor core utilization I wrote a kernel that issues the Tensor core operations explicitly as inline PTX assembly, so that the Tensor cores are kept busy as much as possible.
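For context, the kernel is launched from a minimal host wrapper along the lines of the sketch below. The grid and block dimensions and the output-buffer size are illustrative placeholders, not necessarily the exact values behind the numbers reported here.

// Minimal host-side launch sketch (illustrative only); grid/block sizes are placeholders.
#include <cstdio>
#include <cuda_runtime.h>

// Kernel from above, assumed to be defined in the same .cu file.
extern "C" __global__ void mma_fp16_acc_fp32(float *out);

int main() {
    const int num_blocks = 82 * 4;       // e.g. a few blocks per SM (the RTX 3090 has 82 SMs)
    const int threads_per_block = 128;   // four warps per block (placeholder)

    float *d_out = nullptr;
    cudaMalloc(&d_out, num_blocks * sizeof(float));

    mma_fp16_acc_fp32<<<num_blocks, threads_per_block>>>(d_out);
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Kernel failed: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_out);
    return 0;
}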
However, no matter how much I increase the iteration count or minimize memory accesses, the Nsight Compute metric sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active tops out at about 50%.
Here is the profiled result.
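(For reference, this was collected with Nsight Compute's --metrics option, along the lines of the command below; ./mma_bench is just a placeholder for the benchmark binary.)

ncu --metrics sm__pipe_tensor_cycles_active.avg.pct_of_peak_sustained_active ./mma_bench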
I am wondering whether this value is a hardware-imposed upper limit, or whether there is a way to push the measured metric higher.