Hi,
I am quit confused with shared memory bank conflicts. I found my code sometimes have bank conflicts, some times the bank conflicts is 0.
my code is tested on jetson tx2
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <helper_cuda.h>
#define CHANNEL_TOTAL (32)
__global__ void sharedMemoryTest(float* in)
{
__shared__ float data[CHANNEL_TOTAL];
//printf("threadId = %d\r\n", threadIdx.x);
data[threadIdx.x] = in[threadIdx.x];
printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
}
__global__ void sharedMemoryTest_1(float* in)
{
__shared__ float data[CHANNEL_TOTAL * 17];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 16;
int column = tid % 16;
data[row * 17 + column] = in[tid];
float number = data[17 * tid];
printf("[%d] = %f\r\n", tid, number);
}
__global__ void sharedMemoryTest_2(float* in)
{
__shared__ float data[CHANNEL_TOTAL * 33];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 32;
int column = tid % 32;
data[row * 33 + column] = in[tid];
float number = data[row * 33 + column];
printf("[%d] = %f\r\n", tid, number);
}
int main()
{
int times = 100;
//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
float* data = nullptr;
checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
for (int i = 0; i < CHANNEL_TOTAL; i++) {
data[i] = i;
}
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);
for (int i = 0; i < times; i++) {
sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data);
}
cudaDeviceSynchronize();
return 0;
}
and the nvprof result:
Invocations Event Name Min Max Avg Total
Device "NVIDIA Tegra X2 (0)"
Kernel: sharedMemoryTest_1(float*)
100 shared_ld_bank_conflict 0 2088 511 51128
100 shared_st_bank_conflict 9600 12008 10189 1018962
Kernel: sharedMemoryTest_2(float*)
100 shared_ld_bank_conflict 0 2088 469 46943
100 shared_st_bank_conflict 0 2408 541 54146
Kernel: sharedMemoryTest(float*)
100 shared_ld_bank_conflict 0 2088 500 50098
100 shared_st_bank_conflict 0 2408 577 57778
==29128== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA Tegra X2 (0)"
Kernel: sharedMemoryTest_1(float*)
100 sm_efficiency Multiprocessor Activity 99.79% 100.00% 99.95%
100 achieved_occupancy Achieved Occupancy 0.436820 0.437105 0.436885
100 gld_throughput Global Load Throughput 10.271MB/s 10.588MB/s 10.430MB/s
100 gst_throughput Global Store Throughput 16.707GB/s 17.213GB/s 16.958GB/s
100 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
100 gst_efficiency Global Memory Store Efficiency 0.00% 0.00% 0.00%
100 gld_transactions Global Load Transactions 1689602 2203770 1760970
100 gst_transactions Global Store Transactions 63926400 63958432 63935042
100 gld_transactions_per_request Global Load Transactions Per Request 24.999832 68.469110 55.794881
100 gst_transactions_per_request Global Store Transactions Per Request 29.860987 29.875949 29.865024
100 branch_efficiency Branch Efficiency 99.88% 99.88% 99.88%
100 shared_efficiency Shared Memory Efficiency 57.66% 66.67% 64.36%
100 shared_load_throughput Shared Memory Load Throughput 10.271MB/s 13.174MB/s 11.084MB/s
100 shared_store_throughput Shared Memory Store Throughput 20.543MB/s 23.355MB/s 21.426MB/s
100 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 2.000000 2.217500 2.054359
100 local_load_throughput Local Memory Load Throughput 10.767GB/s 11.092GB/s 10.929GB/s
100 local_store_throughput Local Memory Store Throughput 1.2369GB/s 1.2763GB/s 1.2536GB/s
100 tex_cache_hit_rate Unified Cache Hit Rate 35.72% 35.86% 35.78%
100 tex_cache_transactions Unified Cache Transactions 22150027 22289874 22183257
Kernel: sharedMemoryTest_2(float*)
100 sm_efficiency Multiprocessor Activity 99.87% 100.00% 99.97%
100 achieved_occupancy Achieved Occupancy 0.234150 0.234189 0.234166
100 gld_throughput Global Load Throughput 11.305MB/s 11.457MB/s 11.391MB/s
100 gst_throughput Global Store Throughput 18.383GB/s 18.630GB/s 18.520GB/s
100 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
100 gst_efficiency Global Memory Store Efficiency 0.00% 0.00% 0.00%
100 gld_transactions Global Load Transactions 1689602 1768610 1707618
100 gst_transactions Global Store Transactions 63926400 63951240 63930730
100 gld_transactions_per_request Global Load Transactions Per Request 58.666736 61.410069 59.292317
100 gst_transactions_per_request Global Store Transactions Per Request 29.860987 29.872590 29.863009
100 branch_efficiency Branch Efficiency 99.88% 99.88% 99.88%
100 shared_efficiency Shared Memory Efficiency 68.10% 100.00% 94.04%
100 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 2.8625MB/s 482.50KB/s
100 shared_store_throughput Shared Memory Store Throughput 11.305MB/s 13.894MB/s 11.799MB/s
100 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.000000 1.217500 1.035874
100 local_load_throughput Local Memory Load Throughput 11.844GB/s 12.003GB/s 11.934GB/s
100 local_store_throughput Local Memory Store Throughput 1.3592GB/s 1.3775GB/s 1.3687GB/s
100 tex_cache_hit_rate Unified Cache Hit Rate 36.32% 36.37% 36.35%
100 tex_cache_transactions Unified Cache Transactions 22151709 22179648 22157582
Kernel: sharedMemoryTest(float*)
100 sm_efficiency Multiprocessor Activity 99.86% 100.00% 99.96%
100 achieved_occupancy Achieved Occupancy 0.499137 0.499276 0.499197
100 gld_throughput Global Load Throughput 10.265MB/s 10.517MB/s 10.417MB/s
100 gst_throughput Global Store Throughput 16.693GB/s 17.098GB/s 16.937GB/s
100 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00%
100 gst_efficiency Global Memory Store Efficiency 0.00% 635.75% 6.36%
100 gld_transactions Global Load Transactions 1689602 1807018 1714552
100 gst_transactions Global Store Transactions 63926400 63946624 63931857
100 gld_transactions_per_request Global Load Transactions Per Request 58.666736 62.743681 59.533088
100 gst_transactions_per_request Global Store Transactions Per Request 29.860987 29.870433 29.863536
100 branch_efficiency Branch Efficiency 99.88% 99.88% 99.88%
100 shared_efficiency Shared Memory Efficiency 68.10% 100.00% 91.10%
100 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 2.6254MB/s 695.48KB/s
100 shared_store_throughput Shared Memory Store Throughput 10.293MB/s 12.743MB/s 11.006MB/s
100 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.000000 1.217500 1.056539
100 local_load_throughput Local Memory Load Throughput 10.757GB/s 11.018GB/s 10.915GB/s
100 local_store_throughput Local Memory Store Throughput 1.2329GB/s 1.2674GB/s 1.2527GB/s
100 tex_cache_hit_rate Unified Cache Hit Rate 35.52% 35.61% 35.57%
100 tex_cache_transactions Unified Cache Transactions 22150995 22194901 22163492
Thanks.