Why there is random bank conflicts?

Hi,

I am quit confused with shared memory bank conflicts. I found my code sometimes have bank conflicts, some times the bank conflicts is 0.

my code is tested on jetson tx2

#include <stdlib.h>
#include <stdio.h>

#include <cuda.h>
#include <helper_cuda.h>

#define CHANNEL_TOTAL (32)

__global__ void sharedMemoryTest(float* in)
{
    __shared__ float data[CHANNEL_TOTAL];
    //printf("threadId = %d\r\n", threadIdx.x);
    data[threadIdx.x] = in[threadIdx.x];

    printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
}

__global__ void sharedMemoryTest_1(float* in)
{
    __shared__ float data[CHANNEL_TOTAL * 17];
    //printf("threadId = %d\r\n", threadIdx.x);
    //data[threadIdx.x] = in[threadIdx.x];

    int tid = threadIdx.x;
    int row = tid / 16;
    int column = tid % 16;
    data[row * 17 + column] = in[tid];

    float number = data[17 * tid];
    printf("[%d] = %f\r\n", tid, number);
}

__global__ void sharedMemoryTest_2(float* in)
{
    __shared__ float data[CHANNEL_TOTAL * 33];
    //printf("threadId = %d\r\n", threadIdx.x);
    //data[threadIdx.x] = in[threadIdx.x];

    int tid = threadIdx.x;
    int row = tid / 32;
    int column = tid % 32;
    data[row * 33 + column] = in[tid];

    float number = data[row * 33 + column];
    printf("[%d] = %f\r\n", tid, number);
}

int main()
{
    int times = 100;
    //float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
    float* data = nullptr;
    checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
    for (int i = 0; i < CHANNEL_TOTAL; i++) {
        data[i] = i;
    }

    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);

    for (int i = 0; i < times; i++) {
        sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
        sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
        sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
    }
    cudaDeviceSynchronize();
    return 0;
}

and the nvprof result:

Invocations                                Event Name         Min         Max         Avg       Total
Device "NVIDIA Tegra X2 (0)"
    Kernel: sharedMemoryTest_1(float*)
        100                   shared_ld_bank_conflict           0        2088         511       51128
        100                   shared_st_bank_conflict        9600       12008       10189     1018962
    Kernel: sharedMemoryTest_2(float*)
        100                   shared_ld_bank_conflict           0        2088         469       46943
        100                   shared_st_bank_conflict           0        2408         541       54146
    Kernel: sharedMemoryTest(float*)
        100                   shared_ld_bank_conflict           0        2088         500       50098
        100                   shared_st_bank_conflict           0        2408         577       57778

==29128== Metric result:
Invocations                               Metric Name                             Metric Description         Min         Max         Avg
Device "NVIDIA Tegra X2 (0)"
    Kernel: sharedMemoryTest_1(float*)
        100                             sm_efficiency                        Multiprocessor Activity      99.79%     100.00%      99.95%
        100                        achieved_occupancy                             Achieved Occupancy    0.436820    0.437105    0.436885
        100                            gld_throughput                         Global Load Throughput  10.271MB/s  10.588MB/s  10.430MB/s
        100                            gst_throughput                        Global Store Throughput  16.707GB/s  17.213GB/s  16.958GB/s
        100                            gld_efficiency                  Global Memory Load Efficiency     100.00%     100.00%     100.00%
        100                            gst_efficiency                 Global Memory Store Efficiency       0.00%       0.00%       0.00%
        100                          gld_transactions                       Global Load Transactions     1689602     2203770     1760970
        100                          gst_transactions                      Global Store Transactions    63926400    63958432    63935042
        100              gld_transactions_per_request           Global Load Transactions Per Request   24.999832   68.469110   55.794881
        100              gst_transactions_per_request          Global Store Transactions Per Request   29.860987   29.875949   29.865024
        100                         branch_efficiency                              Branch Efficiency      99.88%      99.88%      99.88%
        100                         shared_efficiency                       Shared Memory Efficiency      57.66%      66.67%      64.36%
        100                    shared_load_throughput                  Shared Memory Load Throughput  10.271MB/s  13.174MB/s  11.084MB/s
        100                   shared_store_throughput                 Shared Memory Store Throughput  20.543MB/s  23.355MB/s  21.426MB/s
        100     shared_store_transactions_per_request   Shared Memory Store Transactions Per Request    2.000000    2.217500    2.054359
        100                     local_load_throughput                   Local Memory Load Throughput  10.767GB/s  11.092GB/s  10.929GB/s
        100                    local_store_throughput                  Local Memory Store Throughput  1.2369GB/s  1.2763GB/s  1.2536GB/s
        100                        tex_cache_hit_rate                         Unified Cache Hit Rate      35.72%      35.86%      35.78%
        100                    tex_cache_transactions                     Unified Cache Transactions    22150027    22289874    22183257
    Kernel: sharedMemoryTest_2(float*)
        100                             sm_efficiency                        Multiprocessor Activity      99.87%     100.00%      99.97%
        100                        achieved_occupancy                             Achieved Occupancy    0.234150    0.234189    0.234166
        100                            gld_throughput                         Global Load Throughput  11.305MB/s  11.457MB/s  11.391MB/s
        100                            gst_throughput                        Global Store Throughput  18.383GB/s  18.630GB/s  18.520GB/s
        100                            gld_efficiency                  Global Memory Load Efficiency     100.00%     100.00%     100.00%
        100                            gst_efficiency                 Global Memory Store Efficiency       0.00%       0.00%       0.00%
        100                          gld_transactions                       Global Load Transactions     1689602     1768610     1707618
        100                          gst_transactions                      Global Store Transactions    63926400    63951240    63930730
        100              gld_transactions_per_request           Global Load Transactions Per Request   58.666736   61.410069   59.292317
        100              gst_transactions_per_request          Global Store Transactions Per Request   29.860987   29.872590   29.863009
        100                         branch_efficiency                              Branch Efficiency      99.88%      99.88%      99.88%
        100                         shared_efficiency                       Shared Memory Efficiency      68.10%     100.00%      94.04%
        100                    shared_load_throughput                  Shared Memory Load Throughput  0.00000B/s  2.8625MB/s  482.50KB/s
        100                   shared_store_throughput                 Shared Memory Store Throughput  11.305MB/s  13.894MB/s  11.799MB/s
        100     shared_store_transactions_per_request   Shared Memory Store Transactions Per Request    1.000000    1.217500    1.035874
        100                     local_load_throughput                   Local Memory Load Throughput  11.844GB/s  12.003GB/s  11.934GB/s
        100                    local_store_throughput                  Local Memory Store Throughput  1.3592GB/s  1.3775GB/s  1.3687GB/s
        100                        tex_cache_hit_rate                         Unified Cache Hit Rate      36.32%      36.37%      36.35%
        100                    tex_cache_transactions                     Unified Cache Transactions    22151709    22179648    22157582
    Kernel: sharedMemoryTest(float*)
        100                             sm_efficiency                        Multiprocessor Activity      99.86%     100.00%      99.96%
        100                        achieved_occupancy                             Achieved Occupancy    0.499137    0.499276    0.499197
        100                            gld_throughput                         Global Load Throughput  10.265MB/s  10.517MB/s  10.417MB/s
        100                            gst_throughput                        Global Store Throughput  16.693GB/s  17.098GB/s  16.937GB/s
        100                            gld_efficiency                  Global Memory Load Efficiency     100.00%     100.00%     100.00%
        100                            gst_efficiency                 Global Memory Store Efficiency       0.00%     635.75%       6.36%
        100                          gld_transactions                       Global Load Transactions     1689602     1807018     1714552
        100                          gst_transactions                      Global Store Transactions    63926400    63946624    63931857
        100              gld_transactions_per_request           Global Load Transactions Per Request   58.666736   62.743681   59.533088
        100              gst_transactions_per_request          Global Store Transactions Per Request   29.860987   29.870433   29.863536
        100                         branch_efficiency                              Branch Efficiency      99.88%      99.88%      99.88%
        100                         shared_efficiency                       Shared Memory Efficiency      68.10%     100.00%      91.10%
        100                    shared_load_throughput                  Shared Memory Load Throughput  0.00000B/s  2.6254MB/s  695.48KB/s
        100                   shared_store_throughput                 Shared Memory Store Throughput  10.293MB/s  12.743MB/s  11.006MB/s
        100     shared_store_transactions_per_request   Shared Memory Store Transactions Per Request    1.000000    1.217500    1.056539
        100                     local_load_throughput                   Local Memory Load Throughput  10.757GB/s  11.018GB/s  10.915GB/s
        100                    local_store_throughput                  Local Memory Store Throughput  1.2329GB/s  1.2674GB/s  1.2527GB/s
        100                        tex_cache_hit_rate                         Unified Cache Hit Rate      35.52%      35.61%      35.57%
        100                    tex_cache_transactions                     Unified Cache Transactions    22150995    22194901    22163492

Thanks.

sorry for reopen this topic.
i meet a similar problem, and i wonder why this happens
my transpose kernel has unexpected number of bank conflect

template <typename T>
__global__ void ktranspose_smem_nbkcft(size_t m, size_t n, T *I, T *O) {
    assert(blockDim.x == blockDim.y && blockDim.z == 1);
    extern __shared__ T tile[];
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    tile[threadIdx.x * (blockDim.y + 1) + threadIdx.y] = (x < n && y < m) ? I[y * n + x] : 0;
    __syncthreads();

    x = blockIdx.y * blockDim.y + threadIdx.x;
    y = blockIdx.x * blockDim.x + threadIdx.y;

    if (x < m && y < n)
        O[y * m + x] = tile[threadIdx.y * (blockDim.x + 1) + threadIdx.x];
}

launch_config always

    int TILE_DIM = 32;
    dim3 BLOCK(TILE_DIM, TILE_DIM);
    dim3 GRID(
        n + TILE_DIM - 1 / TILE_DIM,
        m + TILE_DIM - 1 / TILE_DIM
    );
    shared_mem = TILE_DIM * (TILE_DIM + 1) * sizeof(float);

<<<GRID, BLOCK, shared_mem>>>

i check
nsight compute →
memory workload analysis →
shared memory →
row of shared load and
column of bank conflicts

the experimental result:

input size bank conflicts
32x32 0
64x64 0
512x512 0
1024x1024 randomly from 3900 to 4463 (expect 0)

GTX 3080, Driver Version: 525.125.06 CUDA Version: 12.0

could anyone help? greatly appreciate

Can you file an issue in the Nsight Compute forum with all the details?