Performance of diagonal access to distributed shared memory

I am running some simple benchmarks with distributed shared memory on a GH200. A 128x128 integer matrix lives in shared memory. Warps access the matrix either linearly or diagonally, and the matrix being accessed is either in the block's own shared memory or in the shared memory of the next block in the cluster.

//nvcc -arch=sm_90 -std=c++17 -O3 main.cu -o main
#include <iostream>
#include <cstdlib>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

struct SmemArray{
	unsigned int data[128][128];
};

constexpr int blocksize = 512;


template<bool REMOTE_SMEM, bool DIAGONAL_ACCESS>
__global__
void kernel1(unsigned int* output, int iters){
    extern __shared__ SmemArray externsmem[];
    SmemArray& array = externsmem[0];

    // per-block accumulator for the per-thread sums
    __shared__ unsigned int blocksum;
    if(threadIdx.x == 0){
        blocksum = 0;
    }
	
	auto cluster = cg::this_cluster();
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    // the first 128 threads initialize the matrix, one column per thread
    if(threadIdx.x < 128){
        for(int row = 0; row < 128; row++){
            array.data[row][threadIdx.x] = 1;
        }
    }
    cluster.sync();

    // read either the block's own array or the array of the next block in the cluster
    SmemArray* srcArray = REMOTE_SMEM ?
        cluster.map_shared_rank(&array, (cluster.block_rank() + 1) % cluster.num_blocks()) : &array;

    unsigned int mysum = 0;

    if constexpr (DIAGONAL_ACCESS){
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[(threadIdx.x + i) % 128][warp.thread_rank()];
        }
    }else{
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[i % 128][warp.thread_rank()];
        }
    }

    //don't care about integer overflow, just use the value
    atomicAdd(&blocksum, mysum);
    __syncthreads();
    if(threadIdx.x == 0){
        atomicAdd(output, blocksum);
    }

    // keep all blocks of the cluster resident until every block has finished its remote reads
    cluster.sync();
}



void benchmarkcluster(int timingIterations, int iters, size_t smem, int clustersize){
    cudaLaunchConfig_t config = {0};
    // round the grid up to a multiple of the cluster size
    config.gridDim = ((4096 + clustersize - 1) / clustersize) * clustersize;
    config.blockDim = blocksize;
    config.dynamicSmemBytes = smem;

    // int maxClusterSize = 0;
    // cudaOccupancyMaxPotentialClusterSize(&maxClusterSize, kernel1<false, false>, &config);
    // std::cout << "maxClusterSize " << maxClusterSize << "\n";

    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = clustersize;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.attrs = &attribute[0];
    config.numAttrs = 1; 

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);

    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);

    int numBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSM,
        kernel1<false, false>,
        blocksize,
        smem
    );

    std::cout << "benchmarkcluster. smem: " << smem << ", numBlocksPerSM: " << numBlocksPerSM << ", clustersize " << clustersize << "\n";

    thrust::device_vector<unsigned int> d_output(1);
    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed = 0;

    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "own smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "own smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "remote smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "remote smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;

        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }

    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
}





int main(int argc, char** argv){
    int timingIterations = 5;
    int clustersizeParam = 2;
    const int iters = 100000;

    if(argc > 1){
        timingIterations = std::atoi(argv[1]);
    }
    if(argc > 2){
        clustersizeParam = std::atoi(argv[2]);
    }

    for(int c = 1; c <= 16; c++){
        benchmarkcluster(timingIterations, iters, 200000, c);
    }
}

I measured the following timings (milliseconds):

clustersize                2     3     4     5     6     7     8     9    10    11    12    13    14    15    16
own smem, linear          38    42    41    45    48    47    41    60    69    63    57    54    49    47    43
own smem, diagonal        49    55    53    58    62    61    53    77    89    82    74    69    63    60    56
remote smem, linear      276   351   347   377   386   386   350   474   517   487   457   441   431   405   380
remote smem, diagonal   2503  3201  3162  3454  3537  3551  3173  4445  4741  4413  4221  4087  3956  3802  3380

Neither access pattern should cause shared memory bank conflicts within a warp.
For diagonal access within the same block I guess the overhead comes from each thread touching a different 128-byte cache line instead of the whole warp reading a single one. However, I cannot explain why accesses to a remote block's shared memory become roughly 9 times slower with diagonal access.
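To double-check the bank conflict claim, here is a small host-side sketch (not part of the benchmark, just the index arithmetic of the kernel) that computes which bank each lane of warp 0 would hit, assuming the usual 32 banks of 4 bytes:

// g++ -std=c++17 banks.cpp -o banks   (hypothetical file name; plain host code)
#include <cstdio>

int main(){
    const int i = 0; // one loop iteration of the kernel, warp 0 (threadIdx.x == lane)
    for(int lane = 0; lane < 32; lane++){
        int linearRow = i % 128;            // linear: the whole warp reads one row
        int diagRow   = (lane + i) % 128;   // diagonal: each lane reads its own row
        int col       = lane;               // the column is always the lane id
        int linearBank = (linearRow * 128 + col) % 32; // bank = word index % 32
        int diagBank   = (diagRow   * 128 + col) % 32;
        std::printf("lane %2d: linear bank %2d, diagonal bank %2d\n", lane, linearBank, diagBank);
    }
    // Both patterns map lane l to bank l (because 128 % 32 == 0),
    // so neither should cause bank conflicts.
}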

Where does the overhead come from?
Do the access rules differ between local shared memory and remote shared memory?
I did not find any information about this in the documentation or best practices guide.

I was able to reproduce the observation with your code (not surprising) on an H100 GPU.

I don’t have any precise explanation for it that I can share publicly. However, when I shared your inquiry with some knowledgeable colleagues, they were not surprised by the observation, so it is basically expected behavior, not a bug or a coding defect.

Beyond that I don’t have any specifics to share. If you would like additional clarification, the only suggestion I can offer at this time is to file a bug to request doc clarification of distributed shared memory performance expectations.

It is self-evident from your test that the access-pattern performance expectations for local shared memory do not completely transfer to distributed shared memory. Coalesced/grouped/contiguous access evidently has a performance benefit with distributed shared memory.
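One way to act on that, purely as a sketch I have not benchmarked: fetch the remote data with row-contiguous warp-wide reads into block-local shared memory, and do the irregular (diagonal) indexing only on the local copy. The helper below assumes the SmemArray definition from your code; remote, localTile and rowBase are hypothetical names, and staging 1024 elements to consume 32 of them only pays off if the staged tile is actually reused, which the benchmark kernel above does not do.

// Sketch only: one warp stages a 32x32 tile of the remote array into
// caller-provided block-local shared memory using row-contiguous remote reads,
// then reads the diagonal from the local copy.
__device__ unsigned int diagonalViaLocalStaging(const SmemArray* remote,
                                                unsigned int (*localTile)[32],
                                                int rowBase)
{
    const int lane = threadIdx.x % 32;

    // 32 row-contiguous remote reads per warp: every lane reads column 'lane',
    // so each request touches a single 128-byte segment of the remote array.
    for(int r = 0; r < 32; r++){
        localTile[r][lane] = remote->data[(rowBase + r) % 128][lane];
    }
    __syncwarp();

    // The diagonal read now hits block-local shared memory; with a 32-word row
    // stride the bank is (r * 32 + lane) % 32 = lane, so it is conflict-free.
    return localTile[lane][lane];
}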


Each thread block has shared memory visible to all threads of the block and with the same lifetime as the block. Thread blocks in a thread block cluster can perform read, write, and atomics operations on each other’s shared memory.

This section and the corresponding section in the NVIDIA Hopper Tuning Guide do not provide details on access patterns. On GH100, distributed shared memory does not have the property of local shared memory that each bank can serve a different row. Distributed shared memory accesses go through the SM L1 cache in the same way as a global memory operation, but they do not go out to the GPU L2, which reduces latency. Access-pattern performance for distributed shared memory is therefore closer to that of global memory.
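As a rough illustration of what that model implies for this benchmark (treating a remote request as if it were split into 32-byte sectors like a global memory request; this is a mental model, not a statement about the exact hardware behavior), one can count the sectors a single warp touches per loop iteration:

// Host-side sketch: count the distinct 32-byte sectors one warp touches per
// loop iteration for each access pattern of the benchmark kernel above.
#include <cstdio>
#include <set>

int main(){
    const int i = 0;                               // one loop iteration, warp 0
    std::set<long> linearSectors, diagonalSectors;
    for(int lane = 0; lane < 32; lane++){
        long linearAddr   = ((long)(i % 128) * 128 + lane) * 4;          // data[i % 128][lane]
        long diagonalAddr = ((long)((lane + i) % 128) * 128 + lane) * 4; // data[(lane + i) % 128][lane]
        linearSectors.insert(linearAddr / 32);
        diagonalSectors.insert(diagonalAddr / 32);
    }
    std::printf("linear: %zu sectors, diagonal: %zu sectors per warp request\n",
                linearSectors.size(), diagonalSectors.size());
    // Prints 4 vs 32: the diagonal request moves 8x as many sectors for the
    // same 128 useful bytes, in the same ballpark as the measured ~9x slowdown.
}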

