Why do different numbers of thread blocks cause shared memory bank conflicts?

I have a problem with CUDA kernel shared memory bank conflicts when using the LDG.128 optimization.

When I store data from shared memory to global memory, the access pattern within a warp is fixed. However, as the number of thread blocks changes, the reported bank conflicts vary from none to some.

Here is some code using the broadcasting mechanism and LDG.128. The number of thread blocks varies with the matrix size M.

The device used is an RTX 4060, and the compilation was done with -arch=sm_80.

#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <iostream>

#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <mma.h>

#include "include/ldmatrix.cuh"


template<int VectorLength, int Block_M, int Block_K, int Block_N, int Warp_M, int Warp_K, int Warp_N, int Mma_M, int Mma_K, int Mma_N>
__global__ void OutPutPad(half *output_matrix, int m, int k, int n){

    // Tile origin and lane bookkeeping (several of these values are unused
    // leftovers from the full kernel; this is a stripped-down repro).
    int m_index = blockIdx.x * Block_M;
    int n_index = blockIdx.y * Block_N;

    int warp_id = threadIdx.x / 32;
    int warp_lane_id = threadIdx.x % 32;
    int warp_lane_id_8 = warp_lane_id % 8;
    int warp_group_id_8 = warp_lane_id / 8;

    constexpr int c_tile_size = Block_M * Block_N;

    __shared__ half C_tile[c_tile_size];

    half *c_tile_ptr = C_tile;

    const int c_num_reg = (Warp_N / Mma_N) * (Warp_M / Mma_M) * 2;
    uint c_fragment[c_num_reg];

    // Copy the warp's part of the shared-memory tile to global memory with
    // 128-bit (uint4) accesses. The shared-memory source address depends only
    // on warp_id, so all 32 lanes of a warp read the same 16 bytes (a
    // broadcast, compiled to LDS.128); the global destination differs per
    // lane (STG.128). Note that src and dst do not depend on i, so every
    // unrolled iteration repeats the same copy.
    const int num_iter = Warp_N / Mma_N;
    #pragma unroll
    for(int i = 0; i < num_iter; i++){
        uint4 *src = reinterpret_cast<uint4*>(c_tile_ptr + warp_id * Warp_M * Block_N);

        uint4 *dst = reinterpret_cast<uint4*>(output_matrix + warp_lane_id * 8);

        *dst = *src;
    }
}



void Main(){
    const int M = 128;   // set to 256 for the second profiling run below
    const int K = 32;
    const int N = 4096;
    
    half *matrix_h = new half[M * K];
    for(int i = 0; i < M * K; i++){matrix_h[i] = i;}

    half *matrix_d;
    cudaMalloc(&matrix_d, M * K * sizeof(half));
    cudaMemcpy(matrix_d, matrix_h, M * K * sizeof(half), cudaMemcpyHostToDevice);

    const int V = 64;

    const int BM = 64;
    const int BK = 32;
    const int BN = 64;

    const int WM = 32;
    const int WK = 32;
    const int WN = 64;

    const int MM = 16;
    const int MK = 16;
    const int MN = 8;

    dim3 gridsize(M / BM, N / BN, 1);      // grid x dimension (number of thread blocks) grows with M
    dim3 blocksize(32 * (BM / WM) * (BN / WN), 1, 1);

    printf("Matrix:(M, K, N)=(%d, %d, %d)\n", M, K, N);
    printf("Grid:(X, Y, Z)=(%d, %d, %d)\n", gridsize.x, gridsize.y, gridsize.z);
    printf("Block:(X, Y, Z)=(%d, %d, %d)\n", blocksize.x, blocksize.y, blocksize.z);

    OutPutPad<V, BM, BK, BN, WM, WK, WN, MM, MK, MN><<<gridsize, blocksize>>>(matrix_d, M, K, N);
    cudaDeviceSynchronize();
}



int main() {
    Main();
    return 0;
}

CMakeLists.txt:

cmake_minimum_required(VERSION 3.18)
project(SPATHA CUDA CXX C)

find_package(CUDA REQUIRED)
set(CMAKE_CUDA_ARCHITECTURES 80)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CUDA_FLAGS "-G -g")

set(CUDA_SOURCE_FILES_OUTPUTPAD OutptTile.cu)

add_executable(outputpad ${CUDA_SOURCE_FILES_OUTPUTPAD})
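One caveat about this configuration: -G turns off most device-code optimizations, so the SASS you profile can differ from what a release build would execute. A minimal sketch of the lines one might change for a profiling-friendly build (assuming a CUDA toolkit that supports architecture 89; -lineinfo embeds source-line correlation for Nsight Compute without the code-generation changes of -G):

# Release-style settings for profiling (sketch; replaces the Debug settings above)
set(CMAKE_CUDA_ARCHITECTURES 89)   # native architecture of an RTX 4060 (CC 8.9); sm_80 binaries also run on it
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CUDA_FLAGS "-lineinfo")  # keep source-line info for ncu without disabling optimizations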

When M = 128, no bank conflicts occur, but when M = 256 they do.
Here are the results from NCU.

sudo ncu --metric l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio,l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio,smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct,sm__cycles_elapsed.avg,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.avg,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum,l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.avg  ./outputpad

M=128:

==PROF== Connected to process 73304 (/home/buwf/code/bank/build/outputpad)
==PROF== Profiling "OutPutPad" - 0: 0%....50%....100% - 6 passes
Matrix:(M, K, N)=(128, 32, 4096)
Grid:(X, Y, Z)=(2, 64, 1)
Block:(X, Y, Z)=(64, 1, 1)
==PROF== Disconnected from process 73304
[73304] outputpad@127.0.0.1
  void OutPutPad<(int)64, (int)64, (int)32, (int)64, (int)32, (int)32, (int)64, (int)16, (int)16, (int)8>(__half *, int, int, int) (2, 64, 1)x(64, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    -------------------------------------------------------------------- -------------- ------------
    Metric Name                                                             Metric Unit Metric Value
    -------------------------------------------------------------------- -------------- ------------
    l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request            0
    l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request        16.00
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.avg                                       0
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                       0
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.avg                                       0
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                                       0
    sm__cycles_elapsed.avg                                                        cycle    18,237.33
    smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct                     %          inf
    -------------------------------------------------------------------- -------------- ------------

M=256:

==PROF== Connected to process 73707 (/home/buwf/code/bank/build/outputpad)
==PROF== Profiling "OutPutPad" - 0: 0%....50%....100% - 6 passes
Matrix:(M, K, N)=(256, 32, 4096)
Grid:(X, Y, Z)=(4, 64, 1)
Block:(X, Y, Z)=(64, 1, 1)
==PROF== Disconnected from process 73707
[73707] outputpad@127.0.0.1
  void OutPutPad<(int)64, (int)64, (int)32, (int)64, (int)32, (int)32, (int)64, (int)16, (int)16, (int)8>(__half *, int, int, int) (4, 64, 1)x(64, 1, 1), Context 1, Stream 7, Device 0, CC 8.9
    Section: Command line profiler metrics
    -------------------------------------------------------------------- -------------- ------------
    Metric Name                                                             Metric Unit Metric Value
    -------------------------------------------------------------------- -------------- ------------
    l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/request            0
    l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_st.ratio sector/request        16.00
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.avg                                    6.08
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                     146
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.avg                                       0
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                                       0
    sm__cycles_elapsed.avg                                                        cycle       19,635
    smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct                     %          inf
    -------------------------------------------------------------------- -------------- ------------

I’m not sure if the issue is with my access pattern or something else. Is there a method or tool that can help me identify the cause of the bank conflicts?

Shared memory bank conflicts are not affected by the number of thread blocks you run. A bank conflict is a property of the addresses that the threads of a single warp present to shared memory in one instruction, and that per-warp addressing pattern in your kernel does not depend on the grid size.
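To make that concrete: shared memory has 32 banks of 4 bytes, so the 4-byte word at byte offset b maps to bank (b / 4) % 32, and that mapping is fixed by the per-warp indexing alone. Below is a minimal host-side sketch (the bank_of helper is hypothetical; the constants are taken from the posted launch configuration, which runs two warps per block) that prints the banks touched by each warp's 16-byte shared load in this kernel:

#include <cstdio>

// Hypothetical helper: bank index of the 4-byte word at a given byte offset
// into shared memory (32 banks x 4 bytes on current architectures).
static int bank_of(int byte_offset) { return (byte_offset / 4) % 32; }

int main() {
    const int Block_N = 64, Warp_M = 32;   // values from the posted launch
    for (int warp_id = 0; warp_id < 2; ++warp_id) {
        // In the kernel, every lane of the warp reads the same uint4 (16 bytes):
        //   src = C_tile + warp_id * Warp_M * Block_N   (offset in half elements)
        int byte_offset = warp_id * Warp_M * Block_N * 2;   // sizeof(half) == 2
        printf("warp %d: all 32 lanes load bytes %d..%d -> banks %d..%d (broadcast)\n",
               warp_id, byte_offset, byte_offset + 15,
               bank_of(byte_offset), bank_of(byte_offset + 12));
    }
    return 0;
}

Because every lane of a warp reads the same 16-byte address, the load is a broadcast over four consecutive banks and should be conflict-free; launching more thread blocks only repeats this same per-warp pattern.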

Shared memory bank conflicts also cannot be properly assessed purely by looking at an aggregate metric. It requires analysis: you need to find out which instruction the conflicts are attributed to and then work out the bank that each lane's address maps to.
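One workflow for that analysis (a sketch; the report name is arbitrary, and sudo is only needed if your system restricts access to GPU performance counters):

sudo ncu --set full -o outputpad_report ./outputpad
ncu-ui outputpad_report.ncu-rep

In the Nsight Compute GUI, the Source page attributes shared-memory traffic, including the extra wavefronts caused by bank conflicts, to individual SASS instructions, and to source lines when the binary is built with -lineinfo. That tells you which load or store is actually conflicting; you can then compare its per-lane addresses against the bank mapping worked out by hand, as in the sketch above. It is also worth checking whether the conflicts survive a build without -G, since the debug SASS can access shared memory differently from the optimized code.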