Hi, I noticed that on the A100 each SMSP has 8 LD/ST units. How can it load 32 floats in a cycle?
Also, I wrote a program that loops over LDS instructions to measure the bandwidth of shared memory. The number of blocks is set to 1 and the number of warps is varied from 1 to 9. (Using more than 9 warps causes an error.)
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
// Micro-benchmark kernel: every thread issues back-to-back 32-bit loads from
// shared memory (LDS) so that the host can time shared-memory load throughput.
//
// t: requested number of loads per thread. The loads are issued in groups of
//    16, so the actual count is t rounded down to a multiple of 16.
//
// Launch layout: 1-D block of any size. Uses 128 bytes of static shared
// memory. The loaded values are discarded; the kernel produces no output.
__global__ void loop_lds_kernel(const unsigned long long int t) {
    constexpr unsigned int kLanes = 32;  // one 4-byte word per lane of a warp
    constexpr int kUnroll = 16;          // loads issued per outer iteration

    // volatile keeps the compiler from collapsing the repeated, otherwise
    // redundant loads into a single one.
    volatile __shared__ float smem[kLanes];

    // Index by lane rather than by threadIdx.x: the array only has kLanes
    // elements, so blocks with more than one warp would otherwise read past
    // its end. Consecutive lanes still hit consecutive 4-byte words.
    const unsigned int lane = threadIdx.x % kLanes;

    // Give the array defined contents before anybody reads it.
    if (threadIdx.x < kLanes) {
        smem[lane] = 0.0f;
    }
    __syncthreads();

    float r = 0.0f;
    const unsigned long long int outer_iters = t / kUnroll;
    for (unsigned long long int i = 0; i < outer_iters; ++i) {
        #pragma unroll
        for (int j = 0; j < kUnroll; ++j) {
            r = smem[lane];
        }
    }
    (void)r;  // the value is intentionally unused; only the loads matter
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times loop_lds_kernel with a single block of 1..9 warps and prints, for
// each configuration, the number of warp-level load instructions retired per
// GPU clock cycle. Returns 0 on success and 1 if any CUDA call fails.
int main() {
    const int gpu_rank = 0;
    if (!cuda_ok(cudaSetDevice(gpu_rank), "cudaSetDevice")) {
        return 1;
    }

    cudaDeviceProp deviceProp{};
    if (!cuda_ok(cudaGetDeviceProperties(&deviceProp, gpu_rank),
                 "cudaGetDeviceProperties")) {
        return 1;
    }

    // clockRate is reported in kHz, so kHz / 1e6 = GHz = cycles per ns.
    // NOTE(review): this is the clock the device reports, not necessarily the
    // clock it actually runs at during the measurement -- confirm if the
    // absolute numbers matter.
    const double cycles_per_nanosecond = (double)deviceProp.clockRate / (double)1e6;
    printf("GPU CLOCK: %lf cycle/ns\n", cycles_per_nanosecond);

    const unsigned long long int n_lds = 100000000;  // loads per thread
    const int max_warps = 9;                         // sweep 1..9 warps per block

    // Create the timing events once and reuse them for every configuration.
    cudaEvent_t start, stop;
    if (!cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cuda_ok(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    bool ok = true;
    for (int n_warp = 1; ok && n_warp <= max_warps; n_warp++) {
        float elapsedTime = 0.0f;  // milliseconds

        ok = cuda_ok(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        if (ok) {
            loop_lds_kernel<<<1, n_warp * 32>>>(n_lds);
            // Launch-configuration errors are only visible through
            // cudaGetLastError, so check right after the launch.
            ok = cuda_ok(cudaGetLastError(), "loop_lds_kernel launch");
        }
        ok = ok && cuda_ok(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        // Errors raised while the kernel runs surface at this synchronization.
        ok = ok && cuda_ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        ok = ok && cuda_ok(cudaEventElapsedTime(&elapsedTime, start, stop),
                           "cudaEventElapsedTime");

        if (ok) {
            // elapsedTime [ms] * 1e6 = ns; times cycles/ns = elapsed cycles.
            printf("warps per SM: %d bandwidth: %lf ops/cycle\n", n_warp,
                   n_warp * (double)n_lds / (elapsedTime * 1e6 * cycles_per_nanosecond));
        }
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? 0 : 1;
}
Compile it with nvcc main.cu -gencode=arch=compute_80,code=\"sm_80,compute_80\". The SASS code is too long to post in full, so I include a snippet below.
/*0380*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0390*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*03a0*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*03b0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*03c0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*03d0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*03e0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*03f0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0400*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0410*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0420*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0430*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0440*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0450*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0460*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0470*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0480*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0490*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*04a0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*04b0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*04c0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*04d0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*04e0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*04f0*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0500*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0510*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0520*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0530*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0540*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0550*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0560*/ LDS R20, [R0.X4] ; /* 0x0000000000147984 */
/* 0x0000280000004800 */
/*0570*/ LDS R21, [R0.X4] ; /* 0x0000000000157984 */
/* 0x0000280000004800 */
/*0580*/ LDS R22, [R0.X4] ; /* 0x0000000000167984 */
/* 0x0000280000004800 */
/*0590*/ LDS R23, [R0.X4] ; /* 0x0000000000177984 */
/* 0x0000280000004800 */
/*05a0*/ LDS R24, [R0.X4] ; /* 0x0000000000187984 */
/* 0x0000280000004800 */
/*05b0*/ LDS R25, [R0.X4] ; /* 0x0000000000197984 */
/* 0x0000280000004800 */
/*05c0*/ LDS R26, [R0.X4] ; /* 0x00000000001a7984 */
/* 0x0000280000004800 */
/*05d0*/ LDS R27, [R0.X4] ; /* 0x00000000001b7984 */
/* 0x0000280000004800 */
/*05e0*/ LDS R28, [R0.X4] ; /* 0x00000000001c7984 */
/* 0x0000280000004800 */
/*05f0*/ LDS R29, [R0.X4] ; /* 0x00000000001d7984 */
/* 0x0000280000004800 */
/*0600*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0610*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0620*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0630*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0640*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0650*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0660*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0670*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0680*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0690*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*06a0*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*06b0*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*06c0*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*06d0*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*06e0*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*06f0*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0700*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0710*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0720*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0730*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0740*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0750*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0760*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0770*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0780*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0790*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*07a0*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*07b0*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*07c0*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*07d0*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*07e0*/ LDS R20, [R0.X4] ; /* 0x0000000000147984 */
/* 0x0000280000004800 */
/*07f0*/ LDS R21, [R0.X4] ; /* 0x0000000000157984 */
/* 0x0000280000004800 */
/*0800*/ LDS R22, [R0.X4] ; /* 0x0000000000167984 */
/* 0x0000280000004800 */
/*0810*/ LDS R23, [R0.X4] ; /* 0x0000000000177984 */
/* 0x0000280000004800 */
/*0820*/ LDS R24, [R0.X4] ; /* 0x0000000000187984 */
/* 0x0000280000004800 */
/*0830*/ LDS R25, [R0.X4] ; /* 0x0000000000197984 */
/* 0x0000280000004800 */
/*0840*/ LDS R26, [R0.X4] ; /* 0x00000000001a7984 */
/* 0x0000280000004800 */
/*0850*/ LDS R27, [R0.X4] ; /* 0x00000000001b7984 */
/* 0x0000280000004800 */
/*0860*/ LDS R28, [R0.X4] ; /* 0x00000000001c7984 */
/* 0x0000280000004800 */
/*0870*/ LDS R29, [R0.X4] ; /* 0x00000000001d7984 */
/* 0x0000280000004800 */
/*0880*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0890*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*08a0*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*08b0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*08c0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*08d0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*08e0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*08f0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0900*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0910*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0920*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0930*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0940*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0950*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0960*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0970*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0980*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0990*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*09a0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*09b0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*09c0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*09d0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*09e0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*09f0*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0a00*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0a10*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0a20*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0a30*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0a40*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0a50*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0a60*/ LDS R20, [R0.X4] ; /* 0x0000000000147984 */
/* 0x0000280000004800 */
/*0a70*/ LDS R21, [R0.X4] ; /* 0x0000000000157984 */
/* 0x0000280000004800 */
/*0a80*/ LDS R22, [R0.X4] ; /* 0x0000000000167984 */
/* 0x0000280000004800 */
/*0a90*/ LDS R23, [R0.X4] ; /* 0x0000000000177984 */
/* 0x0000280000004800 */
/*0aa0*/ LDS R24, [R0.X4] ; /* 0x0000000000187984 */
/* 0x0000280000004800 */
/*0ab0*/ LDS R25, [R0.X4] ; /* 0x0000000000197984 */
/* 0x0000280000004800 */
/*0ac0*/ LDS R26, [R0.X4] ; /* 0x00000000001a7984 */
/* 0x0000280000004800 */
/*0ad0*/ LDS R27, [R0.X4] ; /* 0x00000000001b7984 */
/* 0x0000280000004800 */
/*0ae0*/ LDS R28, [R0.X4] ; /* 0x00000000001c7984 */
/* 0x0000280000004800 */
/*0af0*/ LDS R29, [R0.X4] ; /* 0x00000000001d7984 */
/* 0x0000280000004800 */
/*0b00*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0b10*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0b20*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0b30*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0b40*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0b50*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0b60*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0b70*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0b80*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0b90*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0ba0*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0bb0*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0bc0*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0bd0*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0be0*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0bf0*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0c00*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0c10*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0c20*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0c30*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0c40*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0c50*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0c60*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0c70*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0c80*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0c90*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0ca0*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0cb0*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0cc0*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0cd0*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0ce0*/ LDS R20, [R0.X4] ; /* 0x0000000000147984 */
/* 0x0000280000004800 */
/*0cf0*/ LDS R21, [R0.X4] ; /* 0x0000000000157984 */
/* 0x0000280000004800 */
/*0d00*/ LDS R22, [R0.X4] ; /* 0x0000000000167984 */
/* 0x0000280000004800 */
/*0d10*/ LDS R23, [R0.X4] ; /* 0x0000000000177984 */
/* 0x0000280000004800 */
/*0d20*/ LDS R24, [R0.X4] ; /* 0x0000000000187984 */
/* 0x0000280000004800 */
/*0d30*/ LDS R25, [R0.X4] ; /* 0x0000000000197984 */
/* 0x0000280000004800 */
/*0d40*/ LDS R26, [R0.X4] ; /* 0x00000000001a7984 */
/* 0x0000280000004800 */
/*0d50*/ LDS R27, [R0.X4] ; /* 0x00000000001b7984 */
/* 0x0000280000004800 */
/*0d60*/ LDS R28, [R0.X4] ; /* 0x00000000001c7984 */
/* 0x0000280000004800 */
/*0d70*/ LDS R29, [R0.X4] ; /* 0x00000000001d7984 */
/* 0x0000280000004800 */
/*0d80*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0d90*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0da0*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0db0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0dc0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0dd0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0de0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0df0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0e00*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0e10*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0e20*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0e30*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0e40*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0e50*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0e60*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0e70*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*0e80*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*0e90*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*0ea0*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*0eb0*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*0ec0*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*0ed0*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*0ee0*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*0ef0*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*0f00*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*0f10*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*0f20*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*0f30*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*0f40*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*0f50*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
/*0f60*/ LDS R20, [R0.X4] ; /* 0x0000000000147984 */
/* 0x0000280000004800 */
/*0f70*/ LDS R21, [R0.X4] ; /* 0x0000000000157984 */
/* 0x0000280000004800 */
/*0f80*/ LDS R22, [R0.X4] ; /* 0x0000000000167984 */
/* 0x0000280000004800 */
/*0f90*/ LDS R23, [R0.X4] ; /* 0x0000000000177984 */
/* 0x0000280000004800 */
/*0fa0*/ LDS R24, [R0.X4] ; /* 0x0000000000187984 */
/* 0x0000280000004800 */
/*0fb0*/ LDS R25, [R0.X4] ; /* 0x0000000000197984 */
/* 0x0000280000004800 */
/*0fc0*/ LDS R26, [R0.X4] ; /* 0x00000000001a7984 */
/* 0x0000280000004800 */
/*0fd0*/ LDS R27, [R0.X4] ; /* 0x00000000001b7984 */
/* 0x0000280000004800 */
/*0fe0*/ LDS R28, [R0.X4] ; /* 0x00000000001c7984 */
/* 0x0000280000004800 */
/*0ff0*/ LDS R29, [R0.X4] ; /* 0x00000000001d7984 */
/* 0x0000280000004800 */
/*1000*/ LDS R5, [R0.X4] ; /* 0x0000000000057984 */
/* 0x0000280000004800 */
/*1010*/ LDS R6, [R0.X4] ; /* 0x0000000000067984 */
/* 0x0000280000004800 */
/*1020*/ LDS R7, [R0.X4] ; /* 0x0000000000077984 */
/* 0x0000280000004800 */
/*1030*/ LDS R8, [R0.X4] ; /* 0x0000000000087984 */
/* 0x0000280000004800 */
/*1040*/ LDS R9, [R0.X4] ; /* 0x0000000000097984 */
/* 0x0000280000004800 */
/*1050*/ LDS R10, [R0.X4] ; /* 0x00000000000a7984 */
/* 0x0000280000004800 */
/*1060*/ LDS R11, [R0.X4] ; /* 0x00000000000b7984 */
/* 0x0000280000004800 */
/*1070*/ LDS R12, [R0.X4] ; /* 0x00000000000c7984 */
/* 0x0000280000004800 */
/*1080*/ LDS R13, [R0.X4] ; /* 0x00000000000d7984 */
/* 0x0000280000004800 */
/*1090*/ LDS R14, [R0.X4] ; /* 0x00000000000e7984 */
/* 0x0000280000004800 */
/*10a0*/ LDS R15, [R0.X4] ; /* 0x00000000000f7984 */
/* 0x0000280000004800 */
/*10b0*/ LDS R16, [R0.X4] ; /* 0x0000000000107984 */
/* 0x0000280000004800 */
/*10c0*/ LDS R17, [R0.X4] ; /* 0x0000000000117984 */
/* 0x0000280000004800 */
/*10d0*/ LDS R18, [R0.X4] ; /* 0x0000000000127984 */
/* 0x0000280000004800 */
/*10e0*/ LDS R19, [R0.X4] ; /* 0x0000000000137984 */
/* 0x0000280000004800 */
..........
The instructions are independent of one another, which results in good ILP. I ran it on an A100 and post the results below.

My questions are:
- Why does the number of “load operations per cycle” increase by approximately 0.25 for each additional warp as #warp goes from 1 to 4? Take #warp=1 for example: if there is only one warp in an SMSP, it seems to take 4 cycles to complete a bank-conflict-free LDS instruction. AFAIK, a warp should be able to execute one LDS instruction and receive 128 bytes of data (32 lanes × 4 bytes) each cycle when the access is bank-conflict-free.
- Why do #warp=5 and #warp=9 show a small drop in LDS bandwidth (compared to #warp=4 and #warp=8)? AFAIK, more threads should give higher throughput.
