I have some code that could be simplified like this:
#include <stdint.h>
__device__ __forceinline__ static uint laneId()
{
uint ret;
asm("mov.u32 %0, %%laneid;" : "=r"(ret));
return ret;
}
__device__ __forceinline__ static uint warpId()
{
const uint tid = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
return tid / 32;
}
template <typename T, typename S>
__device__ inline void store_as(S *p, const T& x)
{
*reinterpret_cast<T *>(p) = x;
}
template <unsigned int N>
__device__ inline void store_uint16_vector(uint16_t *dest, const uint32_t *ptr);
template <>
__device__ inline void store_uint16_vector<1u>(uint16_t *dest, const uint32_t *ptr)
{
dest[0] = static_cast<uint16_t>(ptr[0]);
}
template <>
__device__ inline void store_uint16_vector<2u>(uint16_t *dest, const uint32_t *ptr)
{
ushort2 uint16x2;
uint16x2.x = static_cast<uint16_t>(ptr[0]);
uint16x2.y = static_cast<uint16_t>(ptr[1]);
store_as<ushort2>(dest, uint16x2);
}
template <>
__device__ inline void store_uint16_vector<4u>(uint16_t *dest, const uint32_t *ptr)
{
ushort4 uint16x4;
uint16x4.x = static_cast<uint16_t>(ptr[0]);
uint16x4.y = static_cast<uint16_t>(ptr[1]);
uint16x4.z = static_cast<uint16_t>(ptr[2]);
uint16x4.w = static_cast<uint16_t>(ptr[3]);
store_as<ushort4>(dest, uint16x4);
}
template <>
__device__ inline void store_uint16_vector<8u>(uint16_t *dest, const uint32_t *ptr)
{
uint4 uint32x4;
store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.x), &ptr[0]);
store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.y), &ptr[2]);
store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.z), &ptr[4]);
store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.w), &ptr[6]);
store_as<uint4>(dest, uint32x4);
}
__global__ void kernel()
{
const unsigned int warpid = warpId();
const unsigned int laneid = laneId();
__shared__ uint16_t smem[8][4][128 + 8];
const unsigned int k = laneid * 16;
const unsigned int k_hi = k / 128;
const unsigned int k_lo = k % 128;
uint32_t sum[16];
store_uint16_vector<8u>(&smem[warpid][k_hi][k_lo], sum);
__syncwarp();
}
The store function is translated into a single STS.128 instruction. Without the +8 padding, one would expect 8-way bank conflicts, which is what nsight compute showed initially. However, after the padding I would expect 4-way bank conflicts:
| Lane | row | col | offset (in multiples of 32b) | base_bank | banks touched |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | {0,1,2,3} |
| 1 | 0 | 1 | 8 | 8 | {8,9,10,11} |
| 2 | 0 | 2 | 16 | 16 | {16,17,18,19} |
| 3 | 0 | 3 | 24 | 24 | {24,25,26,27} |
| 4 | 0 | 4 | 32 | 0 | {0,1,2,3} |
| 5 | 0 | 5 | 40 | 8 | {8,9,10,11} |
| 6 | 0 | 6 | 48 | 16 | {16,17,18,19} |
| 7 | 0 | 7 | 56 | 24 | {24,25,26,27} |
| 8 | 1 | 0 | 68 | 4 | {4,5,6,7} |
| 9 | 1 | 1 | 76 | 12 | {12,13,14,15} |
| 10 | 1 | 2 | 84 | 20 | {20,21,22,23} |
| 11 | 1 | 3 | 92 | 28 | {28,29,30,31} |
| 12 | 1 | 4 | 100 | 4 | {4,5,6,7} |
| 13 | 1 | 5 | 108 | 12 | {12,13,14,15} |
| 14 | 1 | 6 | 116 | 20 | {20,21,22,23} |
| 15 | 1 | 7 | 124 | 28 | {28,29,30,31} |
| 16 | 2 | 0 | 136 | 8 | {8,9,10,11} |
| 17 | 2 | 1 | 144 | 16 | {16,17,18,19} |
| 18 | 2 | 2 | 152 | 24 | {24,25,26,27} |
| 19 | 2 | 3 | 160 | 0 | {0,1,2,3} |
| 20 | 2 | 4 | 168 | 8 | {8,9,10,11} |
| 21 | 2 | 5 | 176 | 16 | {16,17,18,19} |
| 22 | 2 | 6 | 184 | 24 | {24,25,26,27} |
| 23 | 2 | 7 | 192 | 0 | {0,1,2,3} |
| 24 | 3 | 0 | 204 | 12 | {12,13,14,15} |
| 25 | 3 | 1 | 212 | 20 | {20,21,22,23} |
| 26 | 3 | 2 | 220 | 28 | {28,29,30,31} |
| 27 | 3 | 3 | 228 | 4 | {4,5,6,7} |
| 28 | 3 | 4 | 236 | 12 | {12,13,14,15} |
| 29 | 3 | 5 | 244 | 20 | {20,21,22,23} |
| 30 | 3 | 6 | 252 | 28 | {28,29,30,31} |
| 31 | 3 | 7 | 260 | 4 | {4,5,6,7} |
In bank 0 only lanes 0, 4, 19 and 23 are colliding, so 4-way is what I thought was the bank conflict here. However, when running and profiling inside nsight compute I still got 8-way, which seems wrong, but the timing was maintained so maybe it is correct. The problem is that, if it is indeed 8-way, why do my computations revealed 4-way? What am I missing? Is the STS.128 instructions doing something under the hood that I do not understand? Is my analysis wrong? I would like to understand bank conflicts properly in this case.