Performance of diagonal access to distributed shared memory

striker159 · February 18, 2024, 1:52pm

I am doing some simple benchmarks with distributed shared memory on GH200. There is a 128x128 integer matrix in shared memory. Warps access the matrix either linearly or diagonally, and the accessed matrix can be either in block local shared memory or in the shared memory of the next block in the cluster.

//nvcc -arch=sm_90 -std=c++17 -O3 main.cu -o main
#include <iostream>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

struct SmemArray{
	unsigned int data[128][128];
};

constexpr int blocksize = 512;


template<bool REMOTE_SMEM, bool DIAGONAL_ACCESS>
__global__
void kernel1(unsigned int* output, int iters){
	extern __shared__ SmemArray externsmem[];	
	SmemArray& array = externsmem[0];

    __shared__ int blocksum;
    if(threadIdx.x == 0){
        blocksum = 0;
    }
	
	auto cluster = cg::this_cluster();
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    if(threadIdx.x < 128){
        for(int row = 0; row < 128; row++){
            array.data[row][threadIdx.x] = 1;
        }
    }
    cluster.sync();

    SmemArray* srcArray = REMOTE_SMEM ? 
        cluster.map_shared_rank(&array, (cluster.block_rank() + 1) % cluster.num_blocks()) : &array;

    unsigned int mysum = 0;

    if constexpr (DIAGONAL_ACCESS){
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[(threadIdx.x + i) % 128][warp.thread_rank()];
        }
    }else{
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[i % 128][warp.thread_rank()];
        }
    }

    //don't care about integer overflow, just use the value
    atomicAdd(&blocksum, mysum);
    __syncthreads();
    if(threadIdx.x == 0){
        atomicAdd(output, blocksum);
    }

    cluster.sync();
}



void benchmarkcluster(int timingIterations, int iters, size_t smem, int clustersize){
    cudaLaunchConfig_t config = {0};
    config.gridDim = ((4096 + clustersize - 1) / clustersize) * clustersize;
    config.blockDim = blocksize;
    config.dynamicSmemBytes = smem;

    // int maxClusterSize = 0;
    // cudaOccupancyMaxPotentialClusterSize(&maxClusterSize, kernel1<false, false>, &config);
    // std::cout << "maxClusterSize " << maxClusterSize << "\n";

    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = clustersize;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.attrs = &attribute[0];
    config.numAttrs = 1; 

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);

    int numBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSM,
        kernel1<false, false>,
        blocksize,
        smem
    );

    std::cout << "benchmarkcluster. smem: " << smem << ", numBlocksPerSM: " << numBlocksPerSM << ", clustersize " << clustersize << "\n";

    thrust::device_vector<unsigned int> d_output(1);
    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed = 0;

    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "own smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "own smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "remote smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "remote smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }

    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
}





int main(int argc, char** argv){
    int timingIterations = 5;
    int clustersizeParam = 2;
    const int iters = 100000;

    if(argc > 1){
        timingIterations = std::atoi(argv[1]);
    }
    if(argc > 2){
        clustersizeParam = std::atoi(argv[2]);
    }

    for(int c = 1; c <= 16; c++){
        benchmarkcluster(timingIterations, iters, 200000, c);
    }
}

I measured the following timings (milliseconds):

clustersize	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
own smem, linear	38	42	41	45	48	47	41	60	69	63	57	54	49	47	43
own smem, diagonal	49	55	53	58	62	61	53	77	89	82	74	69	63	60	56
remote smem, linear	276	351	347	377	386	386	350	474	517	487	457	441	431	405	380
remote smem, diagonal	2503	3201	3162	3454	3537	3551	3173	4445	4741	4413	4221	4087	3956	3802	3380

The access pattern should have no bank conflicts in a warp.
For diagonal access within the same block I guess the overhead comes from accessing different 128-byte cache lines. However, I cannot explain why accesses to a remote block are 9 times slower with diagonal access.

Where does the overhead come from?
Do the access rules differ between local shared memory and remote shared memory?
I did not find any information about this in the documentation or best practices guide.

Topic		Replies	Views
Distributed Shared Memory vs Shared memory implementation CUDA Programming and Performance cuda , kernel , gpu-computing	7	229	March 19, 2026
Shared memory as slow as global memory CUDA Programming and Performance	8	4655	September 5, 2016
Problem of Distributed Shared Memory CUDA Programming and Performance	1	196	April 23, 2025
Shared mem vs. registers CUDA Programming and Performance	3	1432	October 14, 2009
Local vs Shared Memory execution slows down when using shared memory CUDA Programming and Performance	6	3326	October 14, 2009
No performance inprovement shared mem x global mem CUDA Programming and Performance	5	1271	April 26, 2013
Registers and Shared Memory CUDA Programming and Performance	18	4117	June 16, 2010
shared memory in cuda fortran and increasing time of process Legacy PGI Compilers	1	2088	September 2, 2016
What is the inter-SM linkage of DSM(cluster)? CUDA Programming and Performance	9	857	May 8, 2026
Shared Memory Persistence CUDA Programming and Performance	17	2529	November 25, 2020

Performance of diagonal access to distributed shared memory

Related topics