Very strange share memory bank conflicts

mozzie · October 9, 2021, 9:28am

Hi,

Recently, I meet a very strange problem, when I want to store some caculate result into shared memory there is some bank conflicts, but if I store the index number into it, the bank conflicts is disappeared. I thought the bank conflicts is only related to shared memory index.

the code without bank conflicts:

#include <stdlib.h>
#include <stdio.h>

#include <cuda.h>
#include <helper_cuda.h>

#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)

typedef float fftwf_complex[2];

__global__ void sharedMemoryTest(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL];
	//printf("threadId = %d\r\n", threadIdx.x);
	data[threadIdx.x] = in[threadIdx.x];

	//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
	out[threadIdx.x] = data[threadIdx.x];
}

__global__ void sharedMemoryTest_1(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 17];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 16;
	int column = tid % 16;
	data[row * 17 + column] = in[tid];

	//float number = data[17 * tid];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = data[row * 17 + column];
}

__global__ void sharedMemoryTest_2(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 33];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 32;
	int column = tid % 32;
	data[row * 33 + column] = in[tid];

	float number = data[row * 33 + column];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = number;
}


template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	//    __shared__ float sharedPX[channelTotal];
	//    __shared__ float sharedPY[channelTotal];
	//    __shared__ float sharedX[width];
	//    __shared__ float sharedY[height];
	__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
	__shared__ float sharedSteerVector_1[channelTotal * channelTotal];


	//    sharedPX[threadId] = px[threadId];
	//    sharedPY[threadId] = py[threadId];

	//    int xCount = (width + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < xCount; i++) {
	//        int index = i * xCount + threadId;
	//        sharedX[index] = x[index];

	//        //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
	//    }
	//    int yCount = (height + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < yCount; i++) {
	//        int index = i * yCount + threadId;
	//        sharedY[index] = y[index];
	//    }

//	for (int i = 0; i < channelTotal; i++) {
//		int index = i * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;
//	}

	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
		//		float a = sharedX[j] - sharedPX[k] * 0.001f;
		//        float b = sharedY[i] - sharedPY[k] * 0.001f;
		float a = x[j] - px[k] * 0.001f;
		float b = y[i] - py[k] * 0.001f;

		float c = a * b;


//		int index = k * channelTotal + threadId;
//		sharedSteerVector_0[index] = c;
//		sharedSteerVector_1[index] = c;

		int index = k * channelTotal + threadId;
		sharedSteerVector_0[index] = index;
		sharedSteerVector_1[index] = index;

	}
	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];


	resultData[threadIndex] = Temp1;
}

int main()
{
	int times = 100;
	cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);


	//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
	float* data = nullptr;
	checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* dataOut = nullptr;
	checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	for (int i = 0; i < CHANNEL_TOTAL; i++) {
		data[i] = i;
	}

	float* x = nullptr;
	checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* y = nullptr;
	checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* px = nullptr;
	checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
	float* py = nullptr;
	checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));

	fftwf_complex* rxData = nullptr;
	checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));

	float* result = nullptr;
	checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));


	for (int i = 0; i < times; i++) {
		sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);

		sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);

	}
	cudaDeviceSynchronize();
	return 0;
}

the code with bank conflicts

#include <stdlib.h>
#include <stdio.h>

#include <cuda.h>
#include <helper_cuda.h>

#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)

typedef float fftwf_complex[2];

__global__ void sharedMemoryTest(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL];
	//printf("threadId = %d\r\n", threadIdx.x);
	data[threadIdx.x] = in[threadIdx.x];

	//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
	out[threadIdx.x] = data[threadIdx.x];
}

__global__ void sharedMemoryTest_1(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 17];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 16;
	int column = tid % 16;
	data[row * 17 + column] = in[tid];

	//float number = data[17 * tid];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = data[row * 17 + column];
}

__global__ void sharedMemoryTest_2(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 33];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 32;
	int column = tid % 32;
	data[row * 33 + column] = in[tid];

	float number = data[row * 33 + column];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = number;
}


template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	//    __shared__ float sharedPX[channelTotal];
	//    __shared__ float sharedPY[channelTotal];
	//    __shared__ float sharedX[width];
	//    __shared__ float sharedY[height];
	__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
	__shared__ float sharedSteerVector_1[channelTotal * channelTotal];


	//    sharedPX[threadId] = px[threadId];
	//    sharedPY[threadId] = py[threadId];

	//    int xCount = (width + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < xCount; i++) {
	//        int index = i * xCount + threadId;
	//        sharedX[index] = x[index];

	//        //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
	//    }
	//    int yCount = (height + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < yCount; i++) {
	//        int index = i * yCount + threadId;
	//        sharedY[index] = y[index];
	//    }

//	for (int i = 0; i < channelTotal; i++) {
//		int index = i * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;
//	}

	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
		//		float a = sharedX[j] - sharedPX[k] * 0.001f;
		//        float b = sharedY[i] - sharedPY[k] * 0.001f;
		float a = x[j] - px[k] * 0.001f;
		float b = y[i] - py[k] * 0.001f;

		float c = a * b;


		int index = k * channelTotal + threadId;
		sharedSteerVector_0[index] = c;
		sharedSteerVector_1[index] = c;

//		int index = k * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;

	}
	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];


	resultData[threadIndex] = Temp1;
}

int main()
{
	int times = 100;
	cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);


	//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
	float* data = nullptr;
	checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* dataOut = nullptr;
	checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	for (int i = 0; i < CHANNEL_TOTAL; i++) {
		data[i] = i;
	}

	float* x = nullptr;
	checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* y = nullptr;
	checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* px = nullptr;
	checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
	float* py = nullptr;
	checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));

	fftwf_complex* rxData = nullptr;
	checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));

	float* result = nullptr;
	checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));


	for (int i = 0; i < times; i++) {
		sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);

		sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);

	}
	cudaDeviceSynchronize();
	return 0;
}

complie command:

/usr/local/cuda-10.2/bin/nvcc -ftz=true -prec-div=false -prec-sqrt=false --use_fast_math --compiler-options -fPIC -I"/usr/local/cuda-10.2/include" -I"/usr/local/cuda-10.2/samples/common/inc" -lcuda -lcudadevrt -lcudart -lcublas -lcufft --machine 64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_32,code=sm_32 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -O0 -c -o test_bank_conflict.o test_bank_conflict.cu 
g++ test_bank_conflict.o -L/usr/local/cuda-10.2/lib64 -lcuda -lcurand -lcudadevrt -lcudart -lcublas -lcufft -o main
sudo $(which nvprof) --metrics sm_efficiency,achieved_occupancy,gld_throughput,gst_throughput,gld_efficiency,gst_efficiency,gld_transactions,gst_transactions,gld_transactions_per_request,gst_transactions_per_request,branch_efficiency,shared_efficiency,shared_load_throughput,shared_store_throughput,shared_store_transactions_per_request,local_load_throughput,local_store_throughput,tex_cache_hit_rate,tex_cache_transactions --events shared_ld_bank_conflict,shared_st_bank_conflict ./main

I have tested the code on my jetson tx2,
and the most most strange thing is, after running both of the code on my jetson tx1, there is no bank conflicts at all!!!
What???

achartiernv · October 15, 2021, 7:54pm

Hello, this forum is dedicated to discussions related to using the cuda-memcheck tools.
Questions related to nvprof can be raised at Visual Profiler and nvprof - NVIDIA Developer Forums
You can also try the new Nsight Compute profiler: Nsight Compute :: CUDA Toolkit Documentation

Topic		Replies	Views
Why there is random bank conflicts? CUDA-MEMCHECK cuda	2	1282	September 19, 2023
Very strange share memory bank conflicts CUDA Programming and Performance cuda	4	610	November 2, 2021
How to understand the bank conflict of shared_mem CUDA Programming and Performance	16	16488	November 19, 2025
The increase of the shared memory size leads to the bankconflict (from 9 KB shared memory) Nsight Compute	5	651	July 14, 2023
Allocating more share memory than needed resulted in bank conflict CUDA Programming and Performance	7	319	October 19, 2024
Shared memory bank conflict CUDA Programming and Performance	4	638	July 30, 2025
Avoiding shared memory bank conflicts CUDA Programming and Performance	3	3157	October 12, 2010
About bank conflict of shared_mem CUDA Programming and Performance	2	542	July 25, 2023
Will this code cause bank conflict ? CUDA Programming and Performance	1	500	October 9, 2018
Accessing Shared memory: Unexpected timing result "Naive" access seems to be twice as fast a CUDA Programming and Performance	3	1254	February 27, 2012

Very strange share memory bank conflicts

Related topics