Dear all,
I have a question about shared memory access patterns. The two kernels below solve the same problem, and both access global memory in a coalesced manner. The fundamental difference is the data layout in shared memory: kernel 1 was designed to minimize shared memory bank conflicts and is, in fact, completely conflict-free, whereas kernel 2 uses a much simpler shared memory access pattern and consequently generates up to 16-way bank conflicts.
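As a side note on the two layouts (assuming 32-bit shared memory elements and the 16 banks of a compute 1.x device such as the 9800 GT), a word at index i lives in bank i % 16, so the padded stride of 65 used by kernel 1 spreads a half-warp over all 16 banks, while the unpadded stride of 64 used by kernel 2's final gather puts every thread in the same bank:
[codebox]// Side note, not part of the kernels: on a compute 1.x device shared
// memory has 16 banks of 32-bit words, so a 32-bit word at index i
// lives in bank i % 16 (this sketch assumes the long int elements are
// 4 bytes wide).
#include <stdio.h>

int main(void){
    const int NUM_BANKS = 16;
    // Bank hit by thread tid of a half-warp for the padded (65) and the
    // unpadded (64) stride, as in the final gather of the two kernels.
    for(int tid = 0; tid < 16; ++tid)
        printf("tid %2d: stride 65 -> bank %2d, stride 64 -> bank %2d\n",
               tid, (65 * tid) % NUM_BANKS, (64 * tid) % NUM_BANKS);
    // Stride 65 covers all 16 banks (conflict-free); stride 64 maps
    // every thread of the half-warp to bank 0 (a 16-way conflict).
    return 0;
}[/codebox]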
Source code kernel 1:
[codebox]__global__ void dictSearchGPU(long int *g_Norm, long int *g_X, long int *g_D){
    //Set offsets
    unsigned int normThreadId = threadIdx.x & 15;
    unsigned int threadGroup = (threadIdx.x - normThreadId) >> 4;
    unsigned int offset = 65 * threadGroup;
    //Determine component positions within vectors
    unsigned int sharedVectorPos = offset + normThreadId;
    unsigned int globalVectorPos = 1024 * blockIdx.x + 64 * threadGroup + normThreadId;
    //(X_i - S_i_j)^2 calculation
    __shared__ long int s_D[1039];
    s_D[sharedVectorPos] = g_X[normThreadId] - g_D[globalVectorPos];
    s_D[sharedVectorPos] = __mul24(s_D[sharedVectorPos], s_D[sharedVectorPos]);
    s_D[sharedVectorPos + 16] = g_X[normThreadId + 16] - g_D[globalVectorPos + 16];
    s_D[sharedVectorPos + 16] = __mul24(s_D[sharedVectorPos + 16], s_D[sharedVectorPos + 16]);
    s_D[sharedVectorPos + 32] = g_X[normThreadId + 32] - g_D[globalVectorPos + 32];
    s_D[sharedVectorPos + 32] = __mul24(s_D[sharedVectorPos + 32], s_D[sharedVectorPos + 32]);
    s_D[sharedVectorPos + 48] = g_X[normThreadId + 48] - g_D[globalVectorPos + 48];
    s_D[sharedVectorPos + 48] = __mul24(s_D[sharedVectorPos + 48], s_D[sharedVectorPos + 48]);
    //Reduction
    __syncthreads();
    s_D[sharedVectorPos] += s_D[sharedVectorPos + 16];
    s_D[sharedVectorPos + 32] += s_D[sharedVectorPos + 48];
    s_D[sharedVectorPos] += s_D[sharedVectorPos + 32];
    if(normThreadId < 8)
        s_D[sharedVectorPos] += s_D[sharedVectorPos + 8];
    if(normThreadId < 4)
        s_D[sharedVectorPos] += s_D[sharedVectorPos + 4];
    if(normThreadId < 2)
        s_D[sharedVectorPos] += s_D[sharedVectorPos + 2];
    if(normThreadId == 0)
        s_D[offset] += s_D[offset + 1];
    //Transfer sum results from shared to global memory
    __syncthreads();
    if(threadIdx.x < 16)
        g_Norm[16 * blockIdx.x + threadIdx.x] = s_D[65 * threadIdx.x];
}[/codebox]
Source code kernel 2:
[codebox]__global__ void dictSearchGPU(long int *g_Norm, long int *g_X, long int *g_D){
    //Determine component positions within vectors
    unsigned int XIndex = threadIdx.x & 63;
    unsigned int globalVectorPos = 1024 * blockIdx.x + threadIdx.x;
    //(X_i - S_i_j)^2 calculation
    __shared__ long int s_D[1024];
    s_D[threadIdx.x] = g_X[XIndex] - g_D[globalVectorPos];
    s_D[threadIdx.x] = __mul24(s_D[threadIdx.x], s_D[threadIdx.x]);
    s_D[threadIdx.x + 256] = g_X[XIndex] - g_D[globalVectorPos + 256];
    s_D[threadIdx.x + 256] = __mul24(s_D[threadIdx.x + 256], s_D[threadIdx.x + 256]);
    s_D[threadIdx.x + 512] = g_X[XIndex] - g_D[globalVectorPos + 512];
    s_D[threadIdx.x + 512] = __mul24(s_D[threadIdx.x + 512], s_D[threadIdx.x + 512]);
    s_D[threadIdx.x + 768] = g_X[XIndex] - g_D[globalVectorPos + 768];
    s_D[threadIdx.x + 768] = __mul24(s_D[threadIdx.x + 768], s_D[threadIdx.x + 768]);
    //Reduction
    __syncthreads();
    s_D[2 * threadIdx.x] += s_D[2 * threadIdx.x + 1];
    s_D[2 * threadIdx.x + 512] += s_D[2 * threadIdx.x + 513];
    __syncthreads();
    s_D[4 * threadIdx.x] += s_D[4 * threadIdx.x + 2];
    __syncthreads();
    if(threadIdx.x < 128)
        s_D[8 * threadIdx.x] += s_D[8 * threadIdx.x + 4];
    __syncthreads();
    if(threadIdx.x < 64)
        s_D[16 * threadIdx.x] += s_D[16 * threadIdx.x + 8];
    __syncthreads();
    if(threadIdx.x < 32)
        s_D[32 * threadIdx.x] += s_D[32 * threadIdx.x + 16];
    __syncthreads();
    if(threadIdx.x < 16)
        s_D[64 * threadIdx.x] += s_D[64 * threadIdx.x + 32];
    //Transfer sum results from shared to global memory
    __syncthreads();
    if(threadIdx.x < 16)
        g_Norm[16 * blockIdx.x + threadIdx.x] = s_D[64 * threadIdx.x];
}[/codebox]
The question is: why does kernel 2 execute significantly faster (0.558476 msec on a GeForce 9800 GT) than kernel 1 (0.880197 msec), even though it is much less favorable in terms of shared memory bank conflicts? The ptxas output shows that in both cases the application can schedule 3 thread blocks (= 768 threads) on each SM.
Kernel 1 ptxas info: Used 10 registers, 4184+28 bytes smem, 20 bytes cmem[1].
Kernel 2 ptxas info: Used 8 registers, 4124+28 bytes smem, 24 bytes cmem[1].
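Just to double-check the 3-blocks-per-SM figure from those numbers, here is a rough back-of-envelope check against the usual per-SM limits of a compute 1.1 part (8192 registers, 16 KB shared memory, 768 resident threads); register and shared memory allocation granularity is ignored, so this is only a sanity check:
[codebox]// Back-of-envelope occupancy check for a compute 1.1 part such as the
// 9800 GT (per-SM limits: 8192 registers, 16384 bytes shared memory,
// 768 resident threads). Allocation granularity is ignored.
#include <stdio.h>

static void check(const char *name, int regsPerThread, int smemPerBlock){
    const int threadsPerBlock = 256;
    const int blocks = 3;
    int regs    = blocks * regsPerThread * threadsPerBlock; // e.g. 3 * 10 * 256 = 7680 for kernel 1
    int smem    = blocks * smemPerBlock;                    // e.g. 3 * 4212 = 12636 for kernel 1
    int threads = blocks * threadsPerBlock;                 // 3 * 256 = 768
    printf("%s: %d regs (<= 8192: %s), %d bytes smem (<= 16384: %s), %d threads (<= 768: %s)\n",
           name,
           regs,    regs    <= 8192  ? "yes" : "no",
           smem,    smem    <= 16384 ? "yes" : "no",
           threads, threads <= 768   ? "yes" : "no");
}

int main(void){
    check("kernel 1", 10, 4184 + 28);
    check("kernel 2",  8, 4124 + 28);
    return 0;
}[/codebox]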
Additional information on the source code:
Input parameters:
long int *g_X: vector of 64 elements (values between 0 and 255)
long int *g_D: vector of 32768 * 64 = 2097152 elements (values between 0 and 255)
Output parameters:
long int *g_Norm: vector of 32768 elements
Execution configuration:
2048 thread blocks of 256 threads each
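For completeness, here is the host-side setup in simplified form (initialization of the input vectors and error checking are omitted; the timing with CUDA events is only shown schematically):
[codebox]// Simplified host-side setup matching the sizes and launch
// configuration listed above. The kernel definition is one of the two
// dictSearchGPU versions posted earlier (compiled in the same file).
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void dictSearchGPU(long int *g_Norm, long int *g_X, long int *g_D);

int main(void){
    long int *g_X, *g_D, *g_Norm;
    cudaMalloc((void**)&g_X,    64      * sizeof(long int));
    cudaMalloc((void**)&g_D,    2097152 * sizeof(long int)); // 32768 * 64
    cudaMalloc((void**)&g_Norm, 32768   * sizeof(long int));
    // ... copy the input vectors g_X and g_D to the device here ...

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    dictSearchGPU<<<2048, 256>>>(g_Norm, g_X, g_D); // 2048 blocks of 256 threads
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time: %f msec\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(g_X); cudaFree(g_D); cudaFree(g_Norm);
    return 0;
}[/codebox]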