I am running some simple benchmarks with distributed shared memory on a GH200. A 128x128 integer matrix lives in shared memory. Warps access the matrix either linearly or diagonally, and the accessed matrix is either in the block's own shared memory or in the shared memory of the next block in the cluster.
//nvcc -arch=sm_90 -std=c++17 -O3 main.cu -o main
#include <iostream>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
struct SmemArray{
    unsigned int data[128][128];
};
constexpr int blocksize = 512;
template<bool REMOTE_SMEM, bool DIAGONAL_ACCESS>
__global__
void kernel1(unsigned int* output, int iters){
    // The 128x128 matrix lives in dynamic shared memory.
    extern __shared__ SmemArray externsmem[];
    SmemArray& array = externsmem[0];

    __shared__ unsigned int blocksum;
    if(threadIdx.x == 0){
        blocksum = 0;
    }

    auto cluster = cg::this_cluster();
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());

    // Fill the matrix with ones.
    if(threadIdx.x < 128){
        for(int row = 0; row < 128; row++){
            array.data[row][threadIdx.x] = 1;
        }
    }
    // Every block in the cluster must finish initialization before
    // any block may read remote shared memory.
    cluster.sync();

    // Read either the block's own array or the array of the next block in the cluster.
    SmemArray* srcArray = REMOTE_SMEM ?
        cluster.map_shared_rank(&array, (cluster.block_rank() + 1) % cluster.num_blocks()) : &array;

    unsigned int mysum = 0;
    if constexpr (DIAGONAL_ACCESS){
        // Each lane of a warp reads a different row and a different column,
        // i.e. a diagonal through the matrix.
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[(threadIdx.x + i) % 128][warp.thread_rank()];
        }
    }else{
        // All lanes of a warp read consecutive elements of the same row.
        for(int i = 0; i < iters; i++){
            mysum += srcArray->data[i % 128][warp.thread_rank()];
        }
    }

    // Don't care about integer overflow, just use the value.
    atomicAdd(&blocksum, mysum);
    __syncthreads();
    if(threadIdx.x == 0){
        atomicAdd(output, blocksum);
    }
    // Keep this block's shared memory alive until all remote reads are done.
    cluster.sync();
}
void benchmarkcluster(int timingIterations, int iters, size_t smem, int clustersize){
    cudaLaunchConfig_t config = {0};
    // Round the grid up to a multiple of the cluster size.
    config.gridDim = ((4096 + clustersize - 1) / clustersize) * clustersize;
    config.blockDim = blocksize;
    config.dynamicSmemBytes = smem;

    // int maxClusterSize = 0;
    // cudaOccupancyMaxPotentialClusterSize(&maxClusterSize, kernel1<false, false>, &config);
    // std::cout << "maxClusterSize " << maxClusterSize << "\n";

    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = clustersize;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.attrs = &attribute[0];
    config.numAttrs = 1;

    // Opt in to large dynamic shared memory and to non-portable cluster sizes (> 8).
    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
    cudaFuncSetAttribute(kernel1<false, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<false, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, false>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);
    cudaFuncSetAttribute(kernel1<true, true>, cudaFuncAttributeNonPortableClusterSizeAllowed, true);

    int numBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSM,
        kernel1<false, false>,
        blocksize,
        smem
    );
    std::cout << "benchmarkcluster. smem: " << smem << ", numBlocksPerSM: " << numBlocksPerSM << ", clustersize " << clustersize << "\n";
    thrust::device_vector<unsigned int> d_output(1);

    cudaEvent_t eventA, eventB;
    cudaEventCreate(&eventA);
    cudaEventCreate(&eventB);
    float elapsed = 0;

    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "own smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;
        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = false;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "own smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;
        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = false;
        std::cout << "remote smem, linear access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;
        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    {
        constexpr bool REMOTE_SMEM = true;
        constexpr bool DIAGONAL_ACCESS = true;
        std::cout << "remote smem, diagonal access\n";
        auto kernel = kernel1<REMOTE_SMEM, DIAGONAL_ACCESS>;
        for(int i = 0; i < timingIterations; i++){
            d_output[0] = 0;
            cudaEventRecord(eventA);
            cudaLaunchKernelEx(&config, kernel, d_output.data().get(), iters);
            cudaEventRecord(eventB);
            cudaDeviceSynchronize();
            cudaEventElapsedTime(&elapsed, eventA, eventB);
            std::cout << "elapsed: " << elapsed << ", output " << d_output[0] << "\n";
        }
    }
    cudaEventDestroy(eventA);
    cudaEventDestroy(eventB);
}
int main(int argc, char** argv){
    int timingIterations = 5;
    int maxClusterSize = 16;
    const int iters = 100000;
    if(argc > 1){
        timingIterations = std::atoi(argv[1]);
    }
    if(argc > 2){
        maxClusterSize = std::atoi(argv[2]);
    }
    for(int c = 1; c <= maxClusterSize; c++){
        benchmarkcluster(timingIterations, iters, 200000, c);
    }
}
I measured the following timings (milliseconds):
clustersize | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
own smem, linear | 38 | 42 | 41 | 45 | 48 | 47 | 41 | 60 | 69 | 63 | 57 | 54 | 49 | 47 | 43 |
own smem, diagonal | 49 | 55 | 53 | 58 | 62 | 61 | 53 | 77 | 89 | 82 | 74 | 69 | 63 | 60 | 56 |
remote smem, linear | 276 | 351 | 347 | 377 | 386 | 386 | 350 | 474 | 517 | 487 | 457 | 441 | 431 | 405 | 380 |
remote smem, diagonal | 2503 | 3201 | 3162 | 3454 | 3537 | 3551 | 3173 | 4445 | 4741 | 4413 | 4221 | 4087 | 3956 | 3802 | 3380 |
Neither access pattern should cause bank conflicts within a warp.
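As a sanity check, here is a small host-side sketch of the bank mapping for the diagonal pattern (assuming the usual 32 banks of 4 bytes each, and that threadIdx.x equals the lane id within the first warp); every lane lands in its own bank:
// Host-side sketch of the bank mapping for the diagonal pattern,
// assuming 32 banks of 4 bytes each (not part of the benchmark).
#include <cstdio>
#include <set>
int main(){
    std::set<int> banks;
    for(int lane = 0; lane < 32; lane++){
        int row = (lane + 0) % 128;       // threadIdx.x + i for the first warp, i = 0
        int col = lane;                   // warp.thread_rank()
        int wordIndex = row * 128 + col;  // offset in 4-byte words from the array base
        banks.insert(wordIndex % 32);
    }
    printf("distinct banks touched by one warp: %zu\n", banks.size()); // prints 32
}
Since a row is 128 words and 128 is a multiple of 32, the row index does not affect the bank, so the bank is simply the lane id in both the linear and the diagonal case.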
For diagonal access within the same block, my guess is that the overhead comes from touching different 128-byte cache lines. However, I cannot explain why the remote accesses are roughly 9 times slower with diagonal access than with linear access.
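One experiment that might narrow this down, which I have not run yet, is a granularity sweep between the two patterns: replace the access loop in kernel1 with the sketch below, where GRANULARITY is a hypothetical extra template parameter (a power of two from 1 to 32). A warp then touches 32/GRANULARITY distinct rows per iteration; GRANULARITY = 32 reproduces the linear case and GRANULARITY = 1 the diagonal case.
// Hypothetical drop-in replacement for the access loop in kernel1.
// GRANULARITY would be an additional template parameter (power of two, 1..32).
// A warp touches 32/GRANULARITY distinct rows per iteration:
// GRANULARITY = 32 matches the linear case (one row per warp),
// GRANULARITY = 1 matches the diagonal case (32 rows per warp).
for(int i = 0; i < iters; i++){
    int row = (threadIdx.x / GRANULARITY + i) % 128;
    mysum += srcArray->data[row][warp.thread_rank()];
}
If the remote slowdown scales with the number of distinct rows per warp, that would suggest a per-segment cost for remote transactions rather than a bank-conflict effect.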
Where does the overhead come from?
Do the access rules differ between local shared memory and remote shared memory?
I did not find any information about this in the documentation or in the best practices guide.