Hi,
Recently, I meet a very strange problem, when I want to store some caculate result into shared memory there is some bank conflicts, but if I store the index number into it, the bank conflicts is disappeared. I thought the bank conflicts is only related to shared memory index.
the code without bank conflicts:
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <helper_cuda.h>
#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)
typedef float fftwf_complex[2];
__global__ void sharedMemoryTest(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL];
//printf("threadId = %d\r\n", threadIdx.x);
data[threadIdx.x] = in[threadIdx.x];
//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
out[threadIdx.x] = data[threadIdx.x];
}
__global__ void sharedMemoryTest_1(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL * 17];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 16;
int column = tid % 16;
data[row * 17 + column] = in[tid];
//float number = data[17 * tid];
//printf("[%d] = %f\r\n", tid, number);
out[threadIdx.x] = data[row * 17 + column];
}
__global__ void sharedMemoryTest_2(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL * 33];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 32;
int column = tid % 32;
data[row * 33 + column] = in[tid];
float number = data[row * 33 + column];
//printf("[%d] = %f\r\n", tid, number);
out[threadIdx.x] = number;
}
template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
fftwf_complex* m_rxDatas,
float* resultData)
{
int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
int blockSize = blockDim.x * blockDim.y * blockDim.z;
int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;
// __shared__ float sharedPX[channelTotal];
// __shared__ float sharedPY[channelTotal];
// __shared__ float sharedX[width];
// __shared__ float sharedY[height];
__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
__shared__ float sharedSteerVector_1[channelTotal * channelTotal];
// sharedPX[threadId] = px[threadId];
// sharedPY[threadId] = py[threadId];
// int xCount = (width + channelTotal - 1) / channelTotal;
// for (int i = 0; i < xCount; i++) {
// int index = i * xCount + threadId;
// sharedX[index] = x[index];
// //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
// }
// int yCount = (height + channelTotal - 1) / channelTotal;
// for (int i = 0; i < yCount; i++) {
// int index = i * yCount + threadId;
// sharedY[index] = y[index];
// }
// for (int i = 0; i < channelTotal; i++) {
// int index = i * channelTotal + threadId;
// sharedSteerVector_0[index] = index;
// sharedSteerVector_1[index] = index;
// }
int i = threadIndex / width;
int j = threadIndex % width;
float Temp1 = .0f;
for (int k=0; k < channelTotal; k++)
{
// float a = sharedX[j] - sharedPX[k] * 0.001f;
// float b = sharedY[i] - sharedPY[k] * 0.001f;
float a = x[j] - px[k] * 0.001f;
float b = y[i] - py[k] * 0.001f;
float c = a * b;
// int index = k * channelTotal + threadId;
// sharedSteerVector_0[index] = c;
// sharedSteerVector_1[index] = c;
int index = k * channelTotal + threadId;
sharedSteerVector_0[index] = index;
sharedSteerVector_1[index] = index;
}
Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];
resultData[threadIndex] = Temp1;
}
int main()
{
int times = 100;
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);
//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
float* data = nullptr;
checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* dataOut = nullptr;
checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
for (int i = 0; i < CHANNEL_TOTAL; i++) {
data[i] = i;
}
float* x = nullptr;
checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* y = nullptr;
checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* px = nullptr;
checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
float* py = nullptr;
checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));
fftwf_complex* rxData = nullptr;
checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));
float* result = nullptr;
checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));
for (int i = 0; i < times; i++) {
sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);
}
cudaDeviceSynchronize();
return 0;
}
the code with bank conflicts
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <helper_cuda.h>
#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)
typedef float fftwf_complex[2];
__global__ void sharedMemoryTest(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL];
//printf("threadId = %d\r\n", threadIdx.x);
data[threadIdx.x] = in[threadIdx.x];
//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
out[threadIdx.x] = data[threadIdx.x];
}
__global__ void sharedMemoryTest_1(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL * 17];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 16;
int column = tid % 16;
data[row * 17 + column] = in[tid];
//float number = data[17 * tid];
//printf("[%d] = %f\r\n", tid, number);
out[threadIdx.x] = data[row * 17 + column];
}
__global__ void sharedMemoryTest_2(float* in, float* out)
{
__shared__ float data[CHANNEL_TOTAL * 33];
//printf("threadId = %d\r\n", threadIdx.x);
//data[threadIdx.x] = in[threadIdx.x];
int tid = threadIdx.x;
int row = tid / 32;
int column = tid % 32;
data[row * 33 + column] = in[tid];
float number = data[row * 33 + column];
//printf("[%d] = %f\r\n", tid, number);
out[threadIdx.x] = number;
}
template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
fftwf_complex* m_rxDatas,
float* resultData)
{
int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
int blockSize = blockDim.x * blockDim.y * blockDim.z;
int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;
// __shared__ float sharedPX[channelTotal];
// __shared__ float sharedPY[channelTotal];
// __shared__ float sharedX[width];
// __shared__ float sharedY[height];
__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
__shared__ float sharedSteerVector_1[channelTotal * channelTotal];
// sharedPX[threadId] = px[threadId];
// sharedPY[threadId] = py[threadId];
// int xCount = (width + channelTotal - 1) / channelTotal;
// for (int i = 0; i < xCount; i++) {
// int index = i * xCount + threadId;
// sharedX[index] = x[index];
// //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
// }
// int yCount = (height + channelTotal - 1) / channelTotal;
// for (int i = 0; i < yCount; i++) {
// int index = i * yCount + threadId;
// sharedY[index] = y[index];
// }
// for (int i = 0; i < channelTotal; i++) {
// int index = i * channelTotal + threadId;
// sharedSteerVector_0[index] = index;
// sharedSteerVector_1[index] = index;
// }
int i = threadIndex / width;
int j = threadIndex % width;
float Temp1 = .0f;
for (int k=0; k < channelTotal; k++)
{
// float a = sharedX[j] - sharedPX[k] * 0.001f;
// float b = sharedY[i] - sharedPY[k] * 0.001f;
float a = x[j] - px[k] * 0.001f;
float b = y[i] - py[k] * 0.001f;
float c = a * b;
int index = k * channelTotal + threadId;
sharedSteerVector_0[index] = c;
sharedSteerVector_1[index] = c;
// int index = k * channelTotal + threadId;
// sharedSteerVector_0[index] = index;
// sharedSteerVector_1[index] = index;
}
Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];
resultData[threadIndex] = Temp1;
}
int main()
{
int times = 100;
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);
//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
float* data = nullptr;
checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* dataOut = nullptr;
checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
for (int i = 0; i < CHANNEL_TOTAL; i++) {
data[i] = i;
}
float* x = nullptr;
checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* y = nullptr;
checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
float* px = nullptr;
checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
float* py = nullptr;
checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));
fftwf_complex* rxData = nullptr;
checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));
float* result = nullptr;
checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));
for (int i = 0; i < times; i++) {
sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);
}
cudaDeviceSynchronize();
return 0;
}
complie command:
/usr/local/cuda-10.2/bin/nvcc -ftz=true -prec-div=false -prec-sqrt=false --use_fast_math --compiler-options -fPIC -I"/usr/local/cuda-10.2/include" -I"/usr/local/cuda-10.2/samples/common/inc" -lcuda -lcudadevrt -lcudart -lcublas -lcufft --machine 64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_32,code=sm_32 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -O0 -c -o test_bank_conflict.o test_bank_conflict.cu
g++ test_bank_conflict.o -L/usr/local/cuda-10.2/lib64 -lcuda -lcurand -lcudadevrt -lcudart -lcublas -lcufft -o main
sudo $(which nvprof) --metrics sm_efficiency,achieved_occupancy,gld_throughput,gst_throughput,gld_efficiency,gst_efficiency,gld_transactions,gst_transactions,gld_transactions_per_request,gst_transactions_per_request,branch_efficiency,shared_efficiency,shared_load_throughput,shared_store_throughput,shared_store_transactions_per_request,local_load_throughput,local_store_throughput,tex_cache_hit_rate,tex_cache_transactions --events shared_ld_bank_conflict,shared_st_bank_conflict ./main