Performance of diagonal access to distributed shared memory

I am doing some simple benchmarks with distributed shared memory on GH200. There is a 128x128 integer matrix in shared memory. Warps access the matrix either linearly or diagonally, and the accessed matrix can be either in block local shared memory or in the shared memory of the next block in the cluster.

//nvcc -arch=sm_90 -std=c++17 -O3 main.cu -o main
#include <iostream>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

struct SmemArray{
	unsigned int data[128][128];
};

constexpr int blocksize = 512;


template<bool REMOTE_SMEM, bool DIAGONAL_ACCESS>
__global__
void kernel1(unsigned int* output, int iters){
	extern __shared__ SmemArray externsmem[];	
	SmemArray& array = externsmem[0];

    __shared__ int blocksum;
    if(threadIdx.x == 0){
        blocksum = 0;
    }
	
	auto cluster = cg::this_cluster();
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    if(threadIdx.x < 128){
        for(int row = 0; row < 128; row++){
            array.data[row][threadIdx.x] = 1;
        }
    }
    cluster.sync();

    SmemArray* srcArray = REMOTE_SMEM ? 
        cluster.map_shared_rank(&array, (cluster.block_rank() + 1) % cluster.num_blocks()) : &array;

    unsigned int mysum = 0;

    if constexpr (DIAGONAL_ACCESS){
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[(threadIdx.x + i) % 128][warp.thread_rank()];
        }
    }else{
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[i % 128][warp.thread_rank()];
        }
    }

    //don't care about integer overflow, just use the value
    atomicAdd(&blocksum, mysum);
    __syncthreads();
    if(threadIdx.x == 0){
        atomicAdd(output, blocksum);
    }

    cluster.sync();
}



void benchmarkcluster(int timingIterations, int iters, size_t smem, int clustersize){
    cudaLaunchConfig_t config = {0};
    config.gridDim = ((4096 + clustersize - 1) / clustersize) * clustersize;
    config.blockDim = blocksize;
    config.dynamicSmemBytes = smem;

    // int maxClusterSize = 0;
    // cudaOccupancyMaxPotentialClusterSize(&maxClusterSize, kernel1<false, false>, &config);
    // std::cout << "maxClusterSize " << maxClusterSize << "\n";

    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = clustersize;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.attrs = &attribute[0];
    config.numAttrs = 1; 

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);

    int numBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSM,
        kernel1<false, false>,
        blocksize,
        smem
    );

    std::cout << "benchmarkcluster. smem: " << smem << ", numBlocksPerSM: " << numBlocksPerSM << ", clustersize " << clustersize << "\n";

    thrust::device_vector<unsigned int> d_output(1);
    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed = 0;

    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "own smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "own smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "remote smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "remote smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }

    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
}





int main(int argc, char** argv){
    int timingIterations = 5;
    int clustersizeParam = 2;
    const int iters = 100000;

    if(argc > 1){
        timingIterations = std::atoi(argv[1]);
    }
    if(argc > 2){
        clustersizeParam = std::atoi(argv[2]);
    }

    for(int c = 1; c <= 16; c++){
        benchmarkcluster(timingIterations, iters, 200000, c);
    }
}

I measured the following timings (milliseconds):

clustersize 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
own smem, linear 38 42 41 45 48 47 41 60 69 63 57 54 49 47 43
own smem, diagonal 49 55 53 58 62 61 53 77 89 82 74 69 63 60 56
remote smem, linear 276 351 347 377 386 386 350 474 517 487 457 441 431 405 380
remote smem, diagonal 2503 3201 3162 3454 3537 3551 3173 4445 4741 4413 4221 4087 3956 3802 3380

The access pattern should have no bank conflicts in a warp.
For diagonal access within the same block I guess the overhead comes from accessing different 128-byte cache lines. However, I cannot explain why accesses to a remote block are 9 times slower with diagonal access.

Where does the overhead come from?
Do the access rules differ between local shared memory and remote shared memory?
I did not find any information about this in the documentation or best practices guide.