I want to load 128B (64 half) * 8 data from shared memory to register. Each thread have 16 half register. Here is a simple way to do that (just for example, have bank conflict, assume only have one warp):
__global__ f(int* ptr, ...) {
__shared__ half src[64][64];
half dst[16];
int lane_id = threadIdx.x;
int group_id = lane_id / 4;
int tid_in_group = lane_id % 4;
for (int i = 0; i < 8; i++) {
dst[i * 2] = src[ptr[group_id]][i * 8 + tid_in_group * 2];
dst[i * 2 + 1] = src[ptr[group_id]][i * 8 + tid_in_group * 2 + 1];
}
}
The code will cause bank conflict. To avoid that, I try to load data in cycle:
__global__ f(int* ptr, ...) {
__shared__ half src[64][64];
half dst[16];
int lane_id = threadIdx.x;
int group_id = lane_id / 4;
int tid_in_group = lane_id % 4;
for (int i = 0; i < 8; i++) {
cycle_i = (i + group_id) % 8;
dst[cycle_i * 2] = src[ptr[group_id]][cycle_i * 8 + tid_in_group * 2];
dst[cycle_i * 2 + 1] = src[ptr[group_id]][cycle_i * 8 + tid_in_group * 2 + 1];
}
}
but it will cause the use of local memory because the register used can’t be determined at compile time. Is there any way to both avoid bank conflict and spill?