Very strange share memory bank conflicts

Hi,

Recently, I meet a very strange problem, when I want to store some caculate result into shared memory there is some bank conflicts, but if I store the index number into it, the bank conflicts is disappeared. I thought the bank conflicts is only related to shared memory index.

the code without bank conflicts:

#include <stdlib.h>
#include <stdio.h>

#include <cuda.h>
#include <helper_cuda.h>

#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)

typedef float fftwf_complex[2];

__global__ void sharedMemoryTest(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL];
	//printf("threadId = %d\r\n", threadIdx.x);
	data[threadIdx.x] = in[threadIdx.x];

	//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
	out[threadIdx.x] = data[threadIdx.x];
}

__global__ void sharedMemoryTest_1(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 17];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 16;
	int column = tid % 16;
	data[row * 17 + column] = in[tid];

	//float number = data[17 * tid];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = data[row * 17 + column];
}

__global__ void sharedMemoryTest_2(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 33];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 32;
	int column = tid % 32;
	data[row * 33 + column] = in[tid];

	float number = data[row * 33 + column];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = number;
}


template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	//    __shared__ float sharedPX[channelTotal];
	//    __shared__ float sharedPY[channelTotal];
	//    __shared__ float sharedX[width];
	//    __shared__ float sharedY[height];
	__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
	__shared__ float sharedSteerVector_1[channelTotal * channelTotal];


	//    sharedPX[threadId] = px[threadId];
	//    sharedPY[threadId] = py[threadId];

	//    int xCount = (width + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < xCount; i++) {
	//        int index = i * xCount + threadId;
	//        sharedX[index] = x[index];

	//        //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
	//    }
	//    int yCount = (height + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < yCount; i++) {
	//        int index = i * yCount + threadId;
	//        sharedY[index] = y[index];
	//    }

//	for (int i = 0; i < channelTotal; i++) {
//		int index = i * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;
//	}

	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
		//		float a = sharedX[j] - sharedPX[k] * 0.001f;
		//        float b = sharedY[i] - sharedPY[k] * 0.001f;
		float a = x[j] - px[k] * 0.001f;
		float b = y[i] - py[k] * 0.001f;

		float c = a * b;


//		int index = k * channelTotal + threadId;
//		sharedSteerVector_0[index] = c;
//		sharedSteerVector_1[index] = c;

		int index = k * channelTotal + threadId;
		sharedSteerVector_0[index] = index;
		sharedSteerVector_1[index] = index;

	}
	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];


	resultData[threadIndex] = Temp1;
}

int main()
{
	int times = 100;
	cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);


	//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
	float* data = nullptr;
	checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* dataOut = nullptr;
	checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	for (int i = 0; i < CHANNEL_TOTAL; i++) {
		data[i] = i;
	}

	float* x = nullptr;
	checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* y = nullptr;
	checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* px = nullptr;
	checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
	float* py = nullptr;
	checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));

	fftwf_complex* rxData = nullptr;
	checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));

	float* result = nullptr;
	checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));


	for (int i = 0; i < times; i++) {
		sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);

		sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);

	}
	cudaDeviceSynchronize();
	return 0;
}

the code with bank conflicts

#include <stdlib.h>
#include <stdio.h>

#include <cuda.h>
#include <helper_cuda.h>

#define CHANNEL_TOTAL (32)
#define WIDTH (640)
#define HEIGHT (480)

typedef float fftwf_complex[2];

__global__ void sharedMemoryTest(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL];
	//printf("threadId = %d\r\n", threadIdx.x);
	data[threadIdx.x] = in[threadIdx.x];

	//printf("[%d] = %f\r\n", threadIdx.x, data[threadIdx.x]);
	out[threadIdx.x] = data[threadIdx.x];
}

__global__ void sharedMemoryTest_1(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 17];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 16;
	int column = tid % 16;
	data[row * 17 + column] = in[tid];

	//float number = data[17 * tid];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = data[row * 17 + column];
}

__global__ void sharedMemoryTest_2(float* in, float* out)
{
	__shared__ float data[CHANNEL_TOTAL * 33];
	//printf("threadId = %d\r\n", threadIdx.x);
	//data[threadIdx.x] = in[threadIdx.x];

	int tid = threadIdx.x;
	int row = tid / 32;
	int column = tid % 32;
	data[row * 33 + column] = in[tid];

	float number = data[row * 33 + column];
	//printf("[%d] = %f\r\n", tid, number);
	out[threadIdx.x] = number;
}


template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_3(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	//    __shared__ float sharedPX[channelTotal];
	//    __shared__ float sharedPY[channelTotal];
	//    __shared__ float sharedX[width];
	//    __shared__ float sharedY[height];
	__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
	__shared__ float sharedSteerVector_1[channelTotal * channelTotal];


	//    sharedPX[threadId] = px[threadId];
	//    sharedPY[threadId] = py[threadId];

	//    int xCount = (width + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < xCount; i++) {
	//        int index = i * xCount + threadId;
	//        sharedX[index] = x[index];

	//        //printf("threadId = %d, index = %d, xCount = %d\r\n", threadId, index, xCount);
	//    }
	//    int yCount = (height + channelTotal - 1) / channelTotal;
	//    for (int i = 0; i < yCount; i++) {
	//        int index = i * yCount + threadId;
	//        sharedY[index] = y[index];
	//    }

//	for (int i = 0; i < channelTotal; i++) {
//		int index = i * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;
//	}

	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
		//		float a = sharedX[j] - sharedPX[k] * 0.001f;
		//        float b = sharedY[i] - sharedPY[k] * 0.001f;
		float a = x[j] - px[k] * 0.001f;
		float b = y[i] - py[k] * 0.001f;

		float c = a * b;


		int index = k * channelTotal + threadId;
		sharedSteerVector_0[index] = c;
		sharedSteerVector_1[index] = c;

//		int index = k * channelTotal + threadId;
//		sharedSteerVector_0[index] = index;
//		sharedSteerVector_1[index] = index;

	}
	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];


	resultData[threadIndex] = Temp1;
}

int main()
{
	int times = 100;
	cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024);


	//float* data = (float*)malloc(CHANNEL_TOTAL * sizeof (float));
	float* data = nullptr;
	checkCudaErrors(cudaMallocHost(&data, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* dataOut = nullptr;
	checkCudaErrors(cudaMallocHost(&dataOut, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	for (int i = 0; i < CHANNEL_TOTAL; i++) {
		data[i] = i;
	}

	float* x = nullptr;
	checkCudaErrors(cudaMallocHost(&x, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* y = nullptr;
	checkCudaErrors(cudaMallocHost(&y, CHANNEL_TOTAL * sizeof (float), cudaHostAllocMapped));
	float* px = nullptr;
	checkCudaErrors(cudaMallocHost(&px, WIDTH * sizeof (float), cudaHostAllocMapped));
	float* py = nullptr;
	checkCudaErrors(cudaMallocHost(&py, HEIGHT * sizeof (float), cudaHostAllocMapped));

	fftwf_complex* rxData = nullptr;
	checkCudaErrors(cudaMallocHost(&rxData, CHANNEL_TOTAL * CHANNEL_TOTAL * sizeof (fftwf_complex), cudaHostAllocMapped));

	float* result = nullptr;
	checkCudaErrors(cudaMallocHost(&result, WIDTH * HEIGHT * sizeof (float), cudaHostAllocMapped));


	for (int i = 0; i < times; i++) {
		sharedMemoryTest<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_1<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);
		sharedMemoryTest_2<<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(data, dataOut);

		sound_beamforming_3<WIDTH, HEIGHT, CHANNEL_TOTAL><<<640 * 480 / CHANNEL_TOTAL, CHANNEL_TOTAL>>>(x, y, px, py, rxData, result);

	}
	cudaDeviceSynchronize();
	return 0;
}

complie command:

/usr/local/cuda-10.2/bin/nvcc -ftz=true -prec-div=false -prec-sqrt=false --use_fast_math --compiler-options -fPIC -I"/usr/local/cuda-10.2/include" -I"/usr/local/cuda-10.2/samples/common/inc" -lcuda -lcudadevrt -lcudart -lcublas -lcufft --machine 64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_32,code=sm_32 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_72,code=sm_72 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -O0 -c -o test_bank_conflict.o test_bank_conflict.cu 
g++ test_bank_conflict.o -L/usr/local/cuda-10.2/lib64 -lcuda -lcurand -lcudadevrt -lcudart -lcublas -lcufft -o main
sudo $(which nvprof) --metrics sm_efficiency,achieved_occupancy,gld_throughput,gst_throughput,gld_efficiency,gst_efficiency,gld_transactions,gst_transactions,gld_transactions_per_request,gst_transactions_per_request,branch_efficiency,shared_efficiency,shared_load_throughput,shared_store_throughput,shared_store_transactions_per_request,local_load_throughput,local_store_throughput,tex_cache_hit_rate,tex_cache_transactions --events shared_ld_bank_conflict,shared_st_bank_conflict ./main

When I run your (2nd) code under compute-sanitizer it reports many errors such as “Invalid __global__ read of size 4 bytes”. I don’t bother profiling such codes and wouldn’t recommend that to anyone else. Also, in the future, if you have a question about bank conflicts, I suggest narrowing the focus down to a single kernel.

what do you mean ’ Invalid __global__ read’? why invalid memory read not cause crash? why my application did not show any information about this?

because a CUDA error doesn’t work that way. CUDA errors don’t cause application crashes

because you are not properly checking for CUDA errors

__global__ is a GPU memory space. Invalid means you are attempting to do an illegal read. For example, indexing beyond the end of an array.

A few suggestions:

  1. learn about proper CUDA error checking (google “proper CUDA error checking”. Take the first hit. Study it. Apply it to your code.)
  2. learn about compute-sanitizer from the documentation. Use it on your code.
  3. If you wish, a somewhat more comprehensive treatment of CUDA error management is here.

Thank you very much