Hi ~
How should I interpret the L1 Achieved Value in the Floating Point Operations Roofline model (Single Precision)?
In my understanding, it is calculated by dividing flops by number of bytes requested from L1/TEX cache.
In this case, I expected it to be 68719476736 (smsp__sass_thread_inst_executed_op_fmul_pred_on.sum [inst]) / 274877906944 (bytes requested from L1/TEX cache) = 0.25, but ncu shows 0.12.
Could anyone help point out where my understanding is wrong?
My code is very simple:
// M = N = K = 4096
/**
 * Naive matrix multiplication kernel: each thread computes one element of C.
 *
 * The indexing below reads A as M rows of N floats, B as N rows of K floats,
 * and writes C as M rows of K floats (all row-major, densely packed).
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension covers the output
 * column index k in [0, K) and the y dimension covers the output row index m
 * in [0, M). Threads that fall outside that range exit without touching
 * memory, so the grid may overshoot. No shared memory is used.
 *
 * Element offsets are computed in 64-bit arithmetic so that matrices with more
 * than 2^31 elements do not overflow a 32-bit int index.
 *
 * NOTE(review): each product A[l] * B[l * K] is formed in float and then added
 * into a double accumulator, so the loop is not a pure single-precision
 * multiply-add chain. Keep this in mind when reading single-precision
 * profiler metrics for this kernel.
 *
 * @param A  input matrix, read as M x N
 * @param B  input matrix, read as N x K
 * @param C  output matrix, written as M x K
 * @param M  number of rows of A and C
 * @param N  number of columns of A / rows of B (reduction length)
 * @param K  number of columns of B and C
 */
__global__ void matrix_multiplication_kernel_base(const float *A,
                                                  const float *B, float *C,
                                                  int M, int N, int K) {
  const int k = blockIdx.x * blockDim.x + threadIdx.x;  // output column
  const int m = blockIdx.y * blockDim.y + threadIdx.y;  // output row
  if (k >= K || m >= M) {
    return;
  }

  // Widen before multiplying: m * N, l * K and m * K can exceed INT_MAX.
  const long long row = m;
  const long long b_row_stride = K;

  const float *a_row = A + row * N;  // start of row m of A
  const float *b_col = B + k;        // top of column k of B

  double sum = 0.0;
  for (int l = 0; l < N; ++l) {
    sum += a_row[l] * b_col[static_cast<long long>(l) * b_row_stride];
  }

  // Narrow the double accumulator back to the float output explicitly.
  C[row * b_row_stride + k] = static_cast<float>(sum);
}
/**
 * Host-side launcher: runs only_A, only_B and then
 * matrix_multiplication_kernel_base on the default stream, and blocks until
 * the device has finished.
 *
 * All three kernels use the same launch configuration: 16x16 thread blocks,
 * with the grid sized by ceiling division so that x covers K and y covers M.
 *
 * Errors are checked twice: once right after the launches (launch
 * configuration problems) and once after the synchronize, because faults that
 * occur while a kernel is executing only surface at the next synchronizing
 * call.
 * NOTE(review): this assumes CHECK_CUDA_ERROR() inspects the runtime's last
 * error (cudaGetLastError-style); confirm against the macro definition.
 *
 * @param A  first input matrix (M x N); presumably a device-accessible
 *           pointer, verify against the caller
 * @param B  second input matrix (N x K); same assumption as A
 * @param C  output matrix (M x K); same assumption as A
 * @param M  number of rows of A and C
 * @param N  number of columns of A / rows of B
 * @param K  number of columns of B and C
 */
void solve_base(const float *A, const float *B, float *C, int M, int N, int K) {
  const dim3 threads_per_block(16, 16);
  const dim3 blocks_per_grid(
      (K + threads_per_block.x - 1) / threads_per_block.x,
      (M + threads_per_block.y - 1) / threads_per_block.y);

  only_A<<<blocks_per_grid, threads_per_block>>>(A, C, M, N, K);
  only_B<<<blocks_per_grid, threads_per_block>>>(B, C, M, N, K);
  matrix_multiplication_kernel_base<<<blocks_per_grid, threads_per_block>>>(
      A, B, C, M, N, K);
  CHECK_CUDA_ERROR();  // launch-time errors

  cudaDeviceSynchronize();
  CHECK_CUDA_ERROR();  // errors raised while the kernels were executing
}