I think the problem still isn't fully resolved, even with `#pragma unroll`. Here is the specific matrix-multiplication code. When `k = 128` and `k = 1280` the result is correct, but when `k = 12800` the output is all zeros, and the debug printf output from inside the kernel is all zeros as well. I'm very confused by this and would appreciate your help.
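Before digging into the kernel itself, one useful check is whether the launch even succeeds at the large `k`: with `#pragma unroll` and `k = 12800`, the compiler is asked to replicate the loop body `k / 8 = 1600` times. The snippet below is only a sketch of that check; it reuses `gridDim`, `blockDim`, and the device pointers exactly as they are declared in `main()` at the end of the listing.

```
// Sketch: same launch as in main() below, with status checks added.
// gridDim, blockDim, d_A, d_B, d_C are the variables declared there.
matrixMultiply<16, 16, 12800><<<gridDim, blockDim>>>(d_A, d_B, d_C);
cudaError_t launchErr = cudaGetLastError(); // reports launch-time failures
if (launchErr != cudaSuccess)
    std::cerr << "launch error: " << cudaGetErrorString(launchErr) << std::endl;
cudaError_t execErr = cudaDeviceSynchronize(); // reports failures during kernel execution
if (execErr != cudaSuccess)
    std::cerr << "execution error: " << cudaGetErrorString(execErr) << std::endl;
```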
```
#include <iostream>
#include <cstdint> // uint32_t
#include <cuda_fp16.h>
#include <mma.h>
#include <cuda.h>
#define BLOCK_SIZE 16
using namespace nvcuda;
// Loads an m x n tile of A (leading dimension d), taken from column offset step * n, into shared memory.
__device__ void Load_SmemA(int lane_id,
                           int m, int n,
                           int d,
                           int step,
                           half *value,
                           half *smem_value)
{
    int tt = (m * n + blockDim.x * blockDim.y - 1) / (blockDim.x * blockDim.y);
    for (int k = 0; k < tt; k++)
    {
        int temp = lane_id + k * blockDim.x * blockDim.y;
        int x = temp / n, y = temp % n;
        smem_value[x * n + y] = value[x * d + y + step * n];
        __syncthreads();
    }
    __syncthreads();
}
// Loads an m x n tile of B (leading dimension d), taken from row offset step * m, into shared memory.
__device__ void Load_SmemB(int lane_id,
                           int m, int n,
                           int d,
                           int step,
                           half *value,
                           half *smem_value)
{
    int tt = (m * n + blockDim.x * blockDim.y - 1) / (blockDim.x * blockDim.y);
    for (int k = 0; k < tt; k++)
    {
        int temp = lane_id + k * blockDim.x * blockDim.y;
        int x = temp / n, y = temp % n;
        smem_value[x * n + y] = value[(x + step * m) * d + y];
    }
}
__device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer)
{
    uint32_t address;
    asm("{\n\t"
        " .reg .u64 u64addr;\n\t"
        " cvta.to.shared.u64 u64addr, %1;\n\t"
        " cvt.u32.u64 %0, u64addr;\n\t"
        "}"
        : "=r"(address)
        : "l"(pointer));
    return address;
}
// Loads a 16x8 half tile of A from shared memory into two registers per thread via ldmatrix.x2.
__device__ void Load_FragA(int lane_id,
                           int group_warp,
                           half *value,
                           uint32_t *frag)
{
    // uint32_t shmem_A_lane_addr = __cvta_generic_to_shared(value) + ((lane_id % 16) * 8) * sizeof(uint16_t);
    uint32_t shmem_A_lane_addr = __cvta_generic_to_shared(value) + ((lane_id % 16) * 8) * sizeof(half);
    // printf("%.2f \n", __half2float(value[(lane_id % 16) * 8]));
    asm(
        "ldmatrix.sync.aligned.x2.m8n8.shared.b16 "
        "{%0,%1}, [%2]; "
        : "=r"(frag[0]), "=r"(frag[1])
        : "r"(shmem_A_lane_addr));
}
// Loads an 8x8 half tile of B via transposed ldmatrix; each warp group reads its own half of the 8x16 tile.
__device__ void Load_FragB(int lane_id,
                           int group_warp,
                           half *value,
                           uint32_t *frag)
{
    if (group_warp == 0)
    {
        uint32_t shmem_A_lane_addr = __cvta_generic_to_shared(value) + ((lane_id % 8) * 16 + 8 * group_warp) * sizeof(half);
        asm(
            "ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 "
            "{%0}, [%1]; "
            : "=r"(frag[group_warp])
            : "r"(shmem_A_lane_addr));
    }
    else
    {
        uint32_t shmem_A_lane_addr = __cvta_generic_to_shared(value) + ((lane_id % 8) * 16 + 8 * group_warp) * sizeof(half);
        asm(
            "ldmatrix.sync.aligned.x1.trans.m8n8.shared.b16 "
            "{%0}, [%1]; "
            : "=r"(frag[group_warp])
            : "r"(shmem_A_lane_addr));
    }
}
// Performs one m16n8k8 mma; warp group 0 accumulates into fragC[0..1], group 1 into fragC[2..3].
__device__ void compute(int lane_id,
                        int group_warp,
                        uint32_t *fragA,
                        uint32_t *fragB,
                        uint32_t *fragC)
{
    if (group_warp == 0)
    {
        asm("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
            " {%0, %1},"
            " {%2, %3},"
            " {%4},"
            " {%5, %6};\n"
            : "=r"(fragC[0]), "=r"(fragC[1])
            : "r"(fragA[0]), "r"(fragA[1]),
              "r"(fragB[0]),
              "r"(fragC[0]), "r"(fragC[1]));
    }
    else
    {
        asm("mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 "
            " {%0, %1},"
            " {%2, %3},"
            " {%4},"
            " {%5, %6};\n"
            : "=r"(fragC[2]), "=r"(fragC[3])
            : "r"(fragA[0]), "r"(fragA[1]),
              "r"(fragB[1]),
              "r"(fragC[2]), "r"(fragC[3]));
    }
}
// Writes each warp group's accumulator fragments back into the 16x16 C tile in shared memory.
__device__ void Store_SmemC(int lane_id,
                            int group_warp,
                            half *value,
                            uint32_t *frag)
{
    int x = lane_id / 4, y = lane_id % 4;
    *(uint32_t *)(&value[x * 16 + 2 * y + 8 * group_warp]) = frag[group_warp * 2];
    *(uint32_t *)(&value[(x + 8) * 16 + 2 * y + 8 * group_warp]) = frag[group_warp * 2 + 1];
}
// m16n8k8
// CUDA kernel: matrix multiplication using Tensor Cores.
template <int m, int n, int k>
__global__ void matrixMultiply(half *A, half *B, half *C)
{
    int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
    int group_warp = (threadIdx.y * blockDim.x + threadIdx.x) / 32;
    __shared__ half smem_A[16 * 8];
    __shared__ half smem_B[16 * 8];
    __shared__ half smem_C[16 * 16];
    int threadId = threadIdx.y * blockDim.x + threadIdx.x;
    uint32_t fragA[2];
    uint32_t fragB[2];
    uint32_t acc[4] = {0};
#pragma unroll
    for (int i = 0; i < k / 8; i++)
    {
        Load_SmemA(threadId, 16, 8, k, i, A, smem_A);
        Load_SmemB(threadId, 8, 16, n, i, B, smem_B);
        __syncthreads();
        Load_FragA(lane_id, group_warp, smem_A, fragA);
        Load_FragB(lane_id, group_warp, smem_B, fragB);
        compute(lane_id, group_warp, fragA, fragB, acc);
        if (threadId == 15 && i == 0)
        {
            // Debug: print the two halves packed into fragA[1] for one thread.
            half firstHalf = *reinterpret_cast<half *>(&fragA[1]);
            uint32_t shiftedValue = fragA[1] >> 16;
            half secondHalf = *reinterpret_cast<half *>(&shiftedValue);
            printf("%f %f \n", __half2float(firstHalf), __half2float(secondHalf));
        }
        __syncthreads();
    }
    Store_SmemC(lane_id, group_warp, smem_C, acc);
    __syncthreads();
    if (threadId == 0)
    {
        // Debug: dump the 16x16 result tile from shared memory.
        for (int ii = 0; ii < 16; ii++)
        {
            for (int jj = 0; jj < 16; jj++)
            {
                printf("%f ", __half2float(smem_C[ii * 16 + jj]));
            }
            printf("\n");
        }
    }
}
int main()
{
    // k was tested with 128, 1280, and 12800.
    int m = 16, n = 16, k = 12800;
    half *h_A = new half[m * k];
    half *h_B = new half[k * n];
    half *h_C = new half[m * n];
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < k; j++)
        {
            h_A[i * k + j] = (half)(i * 0.1);
        }
    }
    for (int i = 0; i < k; i++)
    {
        for (int j = 0; j < n; j++)
        {
            h_B[i * n + j] = (half)(j * 0.1);
        }
    }
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            h_C[i * n + j] = (half)(0.0f);
        }
    }
    half *d_A, *d_B;
    half *d_C;
    cudaMalloc((void **)&d_A, m * k * sizeof(half));
    cudaMalloc((void **)&d_B, k * n * sizeof(half));
    cudaMalloc((void **)&d_C, m * n * sizeof(half));
    cudaMemcpy(d_A, h_A, m * k * sizeof(half), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, k * n * sizeof(half), cudaMemcpyHostToDevice);
    dim3 blockDim(1, 64);
    dim3 gridDim(1, 1);
    matrixMultiply<16, 16, 12800><<<gridDim, blockDim>>>(d_A, d_B, d_C);
    cudaDeviceSynchronize();
    cudaMemcpy(h_C, d_C, m * n * sizeof(half), cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    return 0;
}
```
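For reference, a host-side comparison like the one below can be used to confirm the `k = 128` and `k = 1280` results (a sketch only: the helper name `checkResult` is illustrative, and it accumulates in `float` while the kernel accumulates in fp16, so the numbers will only match approximately).

```
#include <cstdio>
#include <cuda_fp16.h>

// Illustrative only: float-accumulated CPU reference for the 16x16 output tile.
void checkResult(const half *h_A, const half *h_B, const half *h_C,
                 int m, int n, int k)
{
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            float ref = 0.0f;
            for (int l = 0; l < k; l++)
                ref += __half2float(h_A[i * k + l]) * __half2float(h_B[l * n + j]);
            printf("C[%d][%d] = %f (reference %f)\n",
                   i, j, __half2float(h_C[i * n + j]), ref);
        }
    }
}
```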