Performance of diagonal access to distributed shared memory

I am running some simple benchmarks with distributed shared memory on a GH200. A 128x128 integer matrix lives in shared memory. Warps access the matrix either linearly or diagonally, and the matrix being accessed is either in the block's own shared memory or in the shared memory of the next block in the cluster.

//nvcc -arch=sm_90 -std=c++17 -O3 main.cu -o main
#include <iostream>
#include <cstdlib>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

struct SmemArray{
	unsigned int data[128][128];
};

constexpr int blocksize = 512;


template<bool REMOTE_SMEM, bool DIAGONAL_ACCESS>
__global__
void kernel1(unsigned int* output, int iters){
    extern __shared__ SmemArray externsmem[];
    SmemArray& array = externsmem[0];

    // per-block accumulator for the per-thread sums
    __shared__ unsigned int blocksum;
    if(threadIdx.x == 0){
        blocksum = 0;
    }
	
	auto cluster = cg::this_cluster();
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    // the first 128 threads initialize the matrix, one column per thread
    if(threadIdx.x < 128){
        for(int row = 0; row < 128; row++){
            array.data[row][threadIdx.x] = 1;
        }
    }
    cluster.sync();

    // read either the block's own array or the array of the next block in the cluster
    SmemArray* srcArray = REMOTE_SMEM ?
        cluster.map_shared_rank(&array, (cluster.block_rank() + 1) % cluster.num_blocks()) : &array;

    unsigned int mysum = 0;

    if constexpr (DIAGONAL_ACCESS){
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[(threadIdx.x + i) % 128][warp.thread_rank()];
        }
    }else{
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[i % 128][warp.thread_rank()];
        }
    }

    //don't care about integer overflow, just use the value
    atomicAdd(&blocksum, mysum);
    __syncthreads();
    if(threadIdx.x == 0){
        atomicAdd(output, blocksum);
    }

    // keep all blocks of the cluster resident until every block has finished its remote reads
    cluster.sync();
}



void benchmarkcluster(int timingIterations, int iters, size_t smem, int clustersize){
    cudaLaunchConfig_t config = {0};
    // round the grid up to a multiple of the cluster size
    config.gridDim = ((4096 + clustersize - 1) / clustersize) * clustersize;
    config.blockDim = blocksize;
    config.dynamicSmemBytes = smem;

    // int maxClusterSize = 0;
    // cudaOccupancyMaxPotentialClusterSize(&maxClusterSize, kernel1<false, false>, &config);
    // std::cout << "maxClusterSize " << maxClusterSize << "\n";

    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = clustersize;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.attrs = &attribute[0];
    config.numAttrs = 1; 

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);

    int numBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSM,
        kernel1<false, false>,
        blocksize,
        smem
    );

    std::cout << "benchmarkcluster. smem: " << smem << ", numBlocksPerSM: " << numBlocksPerSM << ", clustersize " << clustersize << "\n";

    thrust::device_vector<unsigned int> d_output(1);
    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed = 0;

    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "own smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "own smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "remote smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "remote smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }

    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
}





int main(int argc, char** argv){
    int timingIterations = 5;
    int clustersizeParam = 2;
    const int iters = 100000;

    if(argc > 1){
        timingIterations = std::atoi(argv[1]);
    }
    if(argc > 2){
        clustersizeParam = std::atoi(argv[2]);
    }

    for(int c = 1; c <= 16; c++){
        benchmarkcluster(timingIterations, iters, 200000, c);
    }
}

I measured the following timings (milliseconds):

clustersize                2     3     4     5     6     7     8     9    10    11    12    13    14    15    16
own smem, linear          38    42    41    45    48    47    41    60    69    63    57    54    49    47    43
own smem, diagonal        49    55    53    58    62    61    53    77    89    82    74    69    63    60    56
remote smem, linear      276   351   347   377   386   386   350   474   517   487   457   441   431   405   380
remote smem, diagonal   2503  3201  3162  3454  3537  3551  3173  4445  4741  4413  4221  4087  3956  3802  3380

Neither access pattern should cause shared memory bank conflicts within a warp.
For diagonal access within the same block I guess the overhead comes from each thread touching a different 128-byte cache line instead of the whole warp reading a single one. However, I cannot explain why accesses to a remote block's shared memory become roughly 9 times slower with diagonal access.
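To double-check the bank conflict claim, here is a small host-side sketch (not part of the benchmark, just the index arithmetic of the kernel) that computes which bank each lane of warp 0 would hit, assuming the usual 32 banks of 4 bytes:

// g++ -std=c++17 banks.cpp -o banks   (hypothetical file name; plain host code)
#include <cstdio>

int main(){
    const int i = 0; // one loop iteration of the kernel, warp 0 (threadIdx.x == lane)
    for(int lane = 0; lane < 32; lane++){
        int linearRow = i % 128;            // linear: the whole warp reads one row
        int diagRow   = (lane + i) % 128;   // diagonal: each lane reads its own row
        int col       = lane;               // the column is always the lane id
        int linearBank = (linearRow * 128 + col) % 32; // bank = word index % 32
        int diagBank   = (diagRow   * 128 + col) % 32;
        std::printf("lane %2d: linear bank %2d, diagonal bank %2d\n", lane, linearBank, diagBank);
    }
    // Both patterns map lane l to bank l (because 128 % 32 == 0),
    // so neither should cause bank conflicts.
}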

Where does the overhead come from?
Do the access rules differ between local shared memory and remote shared memory?
I did not find any information about this in the documentation or best practices guide.

I was able to reproduce the observation with your code (not surprising) on an H100 GPU.

I don’t have any precise explanation for it that I can share publicly. However, when I shared your inquiry with some knowledgeable colleagues, they were not surprised by the observation, so it is basically expected behavior, not a bug or a coding defect.

Beyond that I don’t have any specifics to share. If you would like additional clarification, the only suggestion I can offer at this time is to file a bug to request doc clarification of distributed shared memory performance expectations.

It is self-evident from your test that the access-pattern performance expectations for local shared memory do not completely transfer to distributed shared memory. Coalesced/grouped/contiguous access evidently has a performance benefit with distributed shared memory.
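One way to act on that, purely as a sketch I have not benchmarked: fetch the remote data with row-contiguous warp-wide reads into block-local shared memory, and do the irregular (diagonal) indexing only on the local copy. The helper below assumes the SmemArray definition from your code; remote, localTile and rowBase are hypothetical names, and staging 1024 elements to consume 32 of them only pays off if the staged tile is actually reused, which the benchmark kernel above does not do.

// Sketch only: one warp stages a 32x32 tile of the remote array into
// caller-provided block-local shared memory using row-contiguous remote reads,
// then reads the diagonal from the local copy.
__device__ unsigned int diagonalViaLocalStaging(const SmemArray* remote,
                                                unsigned int (*localTile)[32],
                                                int rowBase)
{
    const int lane = threadIdx.x % 32;

    // 32 row-contiguous remote reads per warp: every lane reads column 'lane',
    // so each request touches a single 128-byte segment of the remote array.
    for(int r = 0; r < 32; r++){
        localTile[r][lane] = remote->data[(rowBase + r) % 128][lane];
    }
    __syncwarp();

    // The diagonal read now hits block-local shared memory; with a 32-word row
    // stride the bank is (r * 32 + lane) % 32 = lane, so it is conflict-free.
    return localTile[lane][lane];
}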


Each thread block has shared memory visible to all threads of the block and with the same lifetime as the block. Thread blocks in a thread block cluster can perform read, write, and atomics operations on each other’s shared memory.

This section and the corresponding section in the NVIDIA Hopper Tuning Guide do not provide details on access patterns. On GH100, distributed shared memory does not have the property of local shared memory that each bank can serve a different row. Distributed shared memory accesses go through the SM L1 cache in the same way as a global memory operation, but they do not go out to the GPU L2, which reduces latency. Access-pattern performance for distributed shared memory is therefore closer to that of global memory.
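As a rough illustration of what that model implies for this benchmark (treating a remote request as if it were split into 32-byte sectors like a global memory request; this is a mental model, not a statement about the exact hardware behavior), one can count the sectors a single warp touches per loop iteration:

// Host-side sketch: count the distinct 32-byte sectors one warp touches per
// loop iteration for each access pattern of the benchmark kernel above.
#include <cstdio>
#include <set>

int main(){
    const int i = 0;                               // one loop iteration, warp 0
    std::set<long> linearSectors, diagonalSectors;
    for(int lane = 0; lane < 32; lane++){
        long linearAddr   = ((long)(i % 128) * 128 + lane) * 4;          // data[i % 128][lane]
        long diagonalAddr = ((long)((lane + i) % 128) * 128 + lane) * 4; // data[(lane + i) % 128][lane]
        linearSectors.insert(linearAddr / 32);
        diagonalSectors.insert(diagonalAddr / 32);
    }
    std::printf("linear: %zu sectors, diagonal: %zu sectors per warp request\n",
                linearSectors.size(), diagonalSectors.size());
    // Prints 4 vs 32: the diagonal request moves 8x as many sectors for the
    // same 128 useful bytes, in the same ballpark as the measured ~9x slowdown.
}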

