Why store middle result into shared memory has higher delay than store into local memory?

HI,

I have used nvprof to check if there is bank conflicts, and the result is no.


template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_5(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	__shared__ float sharedPX[channelTotal];
	__shared__ float sharedPY[channelTotal];
//	__shared__ float sharedX[width];
//	__shared__ float sharedY[height];
//	__shared__ float sharedSteerVector_0[channelTotal * channelTotal];
//	__shared__ float sharedSteerVector_1[channelTotal * channelTotal];
//	__shared__ float rxDatas_0[channelTotal * channelTotal];
//	__shared__ float rxDatas_1[channelTotal * channelTotal];
	__shared__ cufftComplex RX[channelTotal];


	cufftComplex steer_vector[ARRAY_CHANNEL_NUM];

	sharedPX[threadId] = px[threadId];
	sharedPY[threadId] = py[threadId];


	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
//		float a = sharedX[j] - sharedPX[k] * 0.001f;
//		float b = sharedY[i] - sharedPY[k] * 0.001f;
//		float a = x[j] - px[k] * 0.001f;
//		float b = y[i] - py[k] * 0.001f;
		float a = x[j] - sharedPX[k] * 0.001f;
		float b = y[i] - sharedPY[k] * 0.001f;

		float c = a * b;


//		int index = k * channelTotal + threadId;
//		sharedSteerVector_0[index] = c;
//		sharedSteerVector_1[index] = c;

		//		int index = k * channelTotal + threadId;
		//		sharedSteerVector_0[index] = index;
		//		sharedSteerVector_1[index] = index;


		steer_vector[k].x = c;
		steer_vector[k].y = c;
	}

	//	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];
	for (int ii=0;ii< 32;ii++)
	{
		RX[threadIdx.x].x= m_rxDatas[ii*32+threadIdx.x][0];
		RX[threadIdx.x].y= m_rxDatas[ii*32+threadIdx.x][1];

		float Sum_0 = .0f;
		float Sum_1 = .0f;
		for(int jj=0;jj< 32;jj++)
		{
			float S1_0 = .0f;
			float S1_1 = .0f;

//			int index = jj * channelTotal + threadId;
//			S1_0 = sharedSteerVector_0[index];
//			S1_1 = - sharedSteerVector_1[index];


			S1_0 = steer_vector[jj].x;
			S1_1 = - steer_vector[jj].y;

			float S2_0 = .0f;
			float S2_1 = .0f;

			//			 S2_0 = m_rxDatas[jj * 32 + ii][0];
			//			 S2_1 = m_rxDatas[jj * 32 + ii][1];
//			S2_0 = rxDatas_0[jj * channelTotal + ii];
//			S2_1 = rxDatas_1[jj * channelTotal + ii];

			S2_0 = RX[jj].x;
			S2_1 = RX[jj].y;

//			S2_0 = m_rxDatas[jj * channelTotal + ii][0];
//			S2_1 = m_rxDatas[jj * channelTotal + ii][1];

			Sum_0 += (S1_0 * S2_0 - S1_1 * S2_1);
			Sum_1 += (S1_0 * S2_1 + S1_1 * S2_0);

		}
		float S1_0 = .0f;
		float S1_1 = .0f;

//		int index = ii * channelTotal + threadId;
//		S1_0 = sharedSteerVector_0[index];
//		S1_1 = - sharedSteerVector_1[index];

		S1_0 = steer_vector[ii].x;
		S1_1 = - steer_vector[ii].y;

		Temp1 += Sum_0 * S1_0 - Sum_1 * S1_1;
	}

	resultData[threadIndex] = Temp1;
}

template<unsigned int width, unsigned int height, unsigned int channelTotal>
__global__ void sound_beamforming_6(float* x, float* y, float* px, float* py,
									fftwf_complex* m_rxDatas,
									float* resultData)
{
	int blockId = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x;
	int threadId = threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;
	int blockSize = blockDim.x * blockDim.y * blockDim.z;
	int threadIndex = (blockId) * (blockDim.x * blockDim.y * blockDim.z) + threadId;

	__shared__ float sharedPX[channelTotal];
	__shared__ float sharedPY[channelTotal];
//	__shared__ float sharedX[width];
//	__shared__ float sharedY[height];
	__shared__ cufftComplex sharedSteerVector[channelTotal * channelTotal];
//	__shared__ float rxDatas_0[channelTotal * channelTotal];
//	__shared__ float rxDatas_1[channelTotal * channelTotal];
	__shared__ cufftComplex RX[channelTotal];

	sharedPX[threadId] = px[threadId];
	sharedPY[threadId] = py[threadId];


	int i = threadIndex / width;
	int j = threadIndex % width;

	float Temp1 = .0f;
	for (int k=0; k < channelTotal; k++)
	{
//		float a = sharedX[j] - sharedPX[k] * 0.001f;
//		float b = sharedY[i] - sharedPY[k] * 0.001f;
//		float a = x[j] - px[k] * 0.001f;
//		float b = y[i] - py[k] * 0.001f;
		float a = x[j] - sharedPX[k] * 0.001f;
		float b = y[i] - sharedPY[k] * 0.001f;

		float c = a * b;


		int index = k * channelTotal + threadId;
		sharedSteerVector[index].x = c;
		sharedSteerVector[index].y = c;

//		sharedSteerVector_0[index] = c;
//		sharedSteerVector_1[index] = c;

		//		int index = k * channelTotal + threadId;
		//		sharedSteerVector_0[index] = index;
		//		sharedSteerVector_1[index] = index;


//		steer_vector[k].x = c;
//		steer_vector[k].y = c;
	}

	//	Temp1 += sharedSteerVector_0[0] + sharedSteerVector_1[0];
	for (int ii=0;ii< 32;ii++)
	{
		RX[threadIdx.x].x= m_rxDatas[ii*32+threadIdx.x][0];
		RX[threadIdx.x].y= m_rxDatas[ii*32+threadIdx.x][1];

		float Sum_0 = .0f;
		float Sum_1 = .0f;
		for(int jj=0;jj< 32;jj++)
		{
			float S1_0 = .0f;
			float S1_1 = .0f;

			int index = jj * channelTotal + threadId;
//			S1_0 = sharedSteerVector_0[index];
//			S1_1 = - sharedSteerVector_1[index];

			S1_0 = sharedSteerVector[index].x;
			S1_1 = - sharedSteerVector[index].y;

//			S1_0 = steer_vector[jj].x;
//			S1_1 = - steer_vector[jj].y;

			float S2_0 = .0f;
			float S2_1 = .0f;

			//			 S2_0 = m_rxDatas[jj * 32 + ii][0];
			//			 S2_1 = m_rxDatas[jj * 32 + ii][1];
//			S2_0 = rxDatas_0[jj * channelTotal + ii];
//			S2_1 = rxDatas_1[jj * channelTotal + ii];

			S2_0 = RX[jj].x;
			S2_1 = RX[jj].y;

//			S2_0 = m_rxDatas[jj * channelTotal + ii][0];
//			S2_1 = m_rxDatas[jj * channelTotal + ii][1];

			Sum_0 += (S1_0 * S2_0 - S1_1 * S2_1);
			Sum_1 += (S1_0 * S2_1 + S1_1 * S2_0);

		}
		float S1_0 = .0f;
		float S1_1 = .0f;

		int index = ii * channelTotal + threadId;
//		S1_0 = sharedSteerVector_0[index];
//		S1_1 = - sharedSteerVector_1[index];

//		S1_0 = steer_vector[ii].x;
//		S1_1 = - steer_vector[ii].y;


		S1_0 = sharedSteerVector[index].x;
		S1_1 = - sharedSteerVector[index].y;

		Temp1 += Sum_0 * S1_0 - Sum_1 * S1_1;
	}

	resultData[threadIndex] = Temp1;
}

I tested sound_beamforming_5 on jetson TX1, and its delay is about10ms, but sound_beamforming_6 is about 20ms