Launching a cluster kernel fails with "invalid argument"

I had only read about the cluster features in the documentation before; this is the first time I have tried to launch a kernel with a cluster configuration, on an H100 PCIe. The code is:

#include <iostream>
#include <cstdint>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

#define BLOCKS 8

__global__ void __cluster_dims__(4, 1, 1) test_dsm(int* data, uint64_t* output, int copy_data_len){
    extern __shared__ int dsmem[];
    cg::cluster_group cluster=cg::this_cluster();
    int tid_in_cluster = cg::this_grid().thread_rank();
    int block_id = cluster.block_rank();
    int temp;

    for(size_t i = 1; i < BLOCKS/2; i++)
    {
        uint64_t start_clock;
        if (block_id == 0 && threadIdx.x == 0)
        {
            int *other_block_smem = cluster.map_shared_rank(dsmem, i);
            start_clock = clock64();
            for(int j = 0; j < copy_data_len; j++)
                temp = other_block_smem[j];
        }

        cluster.sync();
        if (block_id == 0 && threadIdx.x == 0)
        {
            uint64_t end_clock = clock64();
            output[i] = end_clock - start_clock;
        }
    }
    __syncthreads();
    data[0]=temp;
}
int main(int argc, char* argv[]){
    cudaLaunchConfig_t config = {0};
    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = BLOCKS;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;
    config.numAttrs = 1;
    config.attrs = attribute;
    config.dynamicSmemBytes = 64*1024;
    config.blockDim = 64;
    config.gridDim = BLOCKS;
    int *dev_data_ptr;
    uint64_t *cycles_ptr, *dev_cycles_ptr;
    int copy_length = 1024;   // assumed value; the original post does not show where copy_length is set
    cycles_ptr = new uint64_t[BLOCKS*1024];
    cudaMalloc(&dev_data_ptr, sizeof(int)*BLOCKS*32*1024*1024);
    cudaMalloc(&dev_cycles_ptr, sizeof(uint64_t)*BLOCKS*1024);
    cudaEvent_t start_event, end_event;
    cudaEventCreate(&start_event);
    cudaEventCreate(&end_event);
    cudaEventRecord(start_event);
    auto launch_res = cudaLaunchKernelEx(&config, test_dsm, dev_data_ptr, dev_cycles_ptr, copy_length);
    if(launch_res != cudaSuccess){
        std::cout<<"Launch Kernel failed: "<<launch_res<<" "<<cudaGetErrorString(launch_res)<<std::endl;
        return 1;
    }
    cudaDeviceSynchronize();
    cudaEventRecord(end_event);
    cudaEventSynchronize(end_event);   // ensure the stop event has completed before reading it
    float time_ms;
    cudaEventElapsedTime(&time_ms, start_event, end_event);
    std::cout<<"Host view test time: "<<time_ms<<std::endl;
    if(cudaPeekAtLastError() != cudaSuccess){
        auto err = cudaGetLastError();
        std::cout<<"CUDA Error: "<<err<<" "<<cudaGetErrorString(err)<<std::endl;
    }
    else{
        std::cout<<"Finished\n";
    }
    cudaEventDestroy(start_event);
    cudaEventDestroy(end_event);
    std::cout<<std::endl;
}

The result is error code 1, "invalid argument". cudaDeviceProp shows that cooperative launch and cluster launch are supported. Is the issue due to the dynamic shared memory configuration? But 64K should be under the per-block limit.

Those don’t match: your kernel declares __cluster_dims__(4, 1, 1), but your launch configuration requests a cluster dimension of 8. From here:

If a kernel uses compile-time cluster size, the cluster size cannot be modified when launching the kernel.

If you define a static (i.e. compile-time) cluster dim as you have done, you can use triple-chevron syntax to launch, which may be simpler than setting up your own launch configuration.

See here for an example. As indicated there, if it were me, I would want to be using CUDA 12.3 or newer on H100.
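
For illustration, a minimal sketch of a triple-chevron launch with a compile-time cluster size (the kernel name and sizes here are illustrative, not taken from your code):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// The cluster size is fixed at compile time; no launch attribute is needed.
__global__ void __cluster_dims__(4, 1, 1) my_cluster_kernel(int* data){
    cg::cluster_group cluster = cg::this_cluster();
    data[blockIdx.x] = (int)cluster.block_rank();
}

int main(){
    int *d_data;
    cudaMalloc(&d_data, 8*sizeof(int));
    // gridDim must be a whole multiple of the cluster size (here, 8 blocks = 2 clusters of 4)
    my_cluster_kernel<<<8, 64>>>(d_data);
    cudaDeviceSynchronize();
}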

After some modifications, the program now executes successfully. I am running the kernel below:

__global__ void test_load_store_bandwidth_DSM_FixLen_no_sync(int* data, uint64_t* output){
    extern __shared__ int dsmem[];
    cg::cluster_group cluster = cg::this_cluster();
    int block_id = cluster.block_rank();
    int temp1, temp2, temp3, temp4;

    int idx_end = copy_data_len/blockDim.x;
    uint64_t start_clock;
    int next_rank = (block_id+1)%BLOCKS;
    int *dst_smem = cluster.map_shared_rank(dsmem, next_rank);
    __syncthreads();
    start_clock = clock64();
    temp1 = dst_smem[0]; temp2 = dst_smem[1]; temp3 = dst_smem[2]; temp4 = dst_smem[4];
 
    cluster.sync();

    uint64_t end_clock=clock64();
    output[block_id]=end_clock-start_clock;

    __syncthreads();
    data[0]=temp1+temp2+temp3+temp4;
}

The data parameter of the function is just a placeholder. I want to perform 4 loads from the distributed shared memory of the other rank and put the clock cycles spent into the output array. The result is 900 to 1400 cycles. Is this correct? I think ~1000 cycles is roughly the cost of loading 4 ints from global memory, and distributed shared memory should be much faster than that. Is there anything I got wrong again?

I’m not sure whether it is correct or not. One of the things I would do, if I were concerned about such a measurement, is the SASS analysis that I tried to train you on in your last question. I don’t see any evidence that you have done that here, and I personally cannot be sure what the clock64() timestamps are measuring without doing that kind of analysis.

Regarding the comparison to loading from global memory, that might not be surprising for DSM. The behavior is somewhere between “ordinary” global behavior and “ordinary” shared behavior.

Also, the code you have now posted won’t compile (copy_data_len is used but never declared or passed as a parameter).

The cluster.sync() should be issued prior to any DSMEM accesses, to ensure that the other CTAs have launched; the code below does this before the timed loads.

It is not clear if you are trying to measure

  1. warp dsmem read instruction issue latency
  2. warp dsmem read latency (requires a dependency before clock64())
  3. block dsmem read latency
  4. cluster dsmem read latency

The kernel below timestamps all four:

#include <stdint.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

#define BLOCKS 8

__global__ void test_load_store_bandwidth_DSM_FixLen_no_sync(int* data, uint64_t* output, int copy_data_len){
    extern __shared__ int dsmem[];
    cg::cluster_group cluster = cg::this_cluster();
    int block_id = cluster.block_rank();

    int idx_end = copy_data_len/blockDim.x;   // unused here; kept from the original kernel

    int next_rank = (block_id+1)%BLOCKS;
    int *dst_smem = cluster.map_shared_rank(dsmem, next_rank);

    uint64_t time_start = 0;
    uint64_t time_end_warp_issue = 0;
    uint64_t time_end_warp_complete = 0;
    uint64_t time_end_block = 0;
    uint64_t time_end_cluster = 0;

    int value = 0;

    // Ensure all warps have completed initialization.
    __syncthreads();

    // Ensure all thread blocks in the cluster have launched and initialized.
    cluster.sync();

    time_start = clock64();

    int temp0 = dst_smem[0];
    int temp1 = dst_smem[1];
    int temp2 = dst_smem[2];
    int temp3 = dst_smem[4];

    // TODO - check SASS as this may be moved. If a dependency is not added for the return value
    // the timestamp will only measure the cycles to issue the loads.
    time_end_warp_issue = clock64();

    value = temp0 + temp1 + temp2 + temp3;
    time_end_warp_complete = clock64();

    __syncthreads();

    time_end_block = clock64();

    cluster.sync();
    time_end_cluster = clock64();

    data[0] = value;
    output[block_id * 4 + 0] = time_end_warp_issue - time_start;
    output[block_id * 4 + 1] = time_end_warp_complete - time_start;
    output[block_id * 4 + 2] = time_end_block - time_start;
    output[block_id * 4 + 3] = time_end_cluster - time_start;
}
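
For reference, a host-side launch for this kernel might look like the sketch below. This is my own sketch, not code from the thread; the 64 KB shared memory size and the copy_data_len value of 1024 are assumptions, and note that requesting more than 48 KB of dynamic shared memory per block requires the cudaFuncSetAttribute opt-in:

int main(){
    int *d_data;
    uint64_t *d_output;
    cudaMalloc(&d_data, sizeof(int));
    cudaMalloc(&d_output, sizeof(uint64_t)*BLOCKS*4);

    // Opt in to >48KB of dynamic shared memory per block (64KB assumed here).
    size_t smem_bytes = 64*1024;
    cudaFuncSetAttribute(test_load_store_bandwidth_DSM_FixLen_no_sync,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, (int)smem_bytes);

    // The runtime cluster dimension must be BLOCKS, since the kernel wraps
    // next_rank with %BLOCKS and carries no compile-time __cluster_dims__.
    cudaLaunchAttribute attribute[1];
    attribute[0].id = cudaLaunchAttributeClusterDimension;
    attribute[0].val.clusterDim.x = BLOCKS;
    attribute[0].val.clusterDim.y = 1;
    attribute[0].val.clusterDim.z = 1;

    cudaLaunchConfig_t config = {0};
    config.gridDim = BLOCKS;
    config.blockDim = 64;
    config.dynamicSmemBytes = smem_bytes;
    config.attrs = attribute;
    config.numAttrs = 1;

    cudaLaunchKernelEx(&config, test_load_store_bandwidth_DSM_FixLen_no_sync,
                       d_data, d_output, /*copy_data_len=*/1024);
    cudaDeviceSynchronize();
}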

If you paste this into godbolt.org with NVCC 12.5.1 and the arguments -arch=compute_90 -code=sm_90, you can view the SASS, and it does have the CS2R Rdst, SR_CLOCKLO instructions in the correct locations.

 CS2R R2, SR_CLOCKLO                <-\--\  // time_start
 LD.E R8, desc[UR4][R4.64]            |  |
 LD.E R9, desc[UR4][R4.64+0x4]        |  |
 LD.E R10, desc[UR4][R4.64+0x8]       |  |
 LD.E R11, desc[UR4][R4.64+0x10]      |  |
 CS2R R6, SR_CLOCKLO                <-/  |  // time_end_warp_issue
 IADD3 R8, R10, R9, R8                   |
 IMAD.IADD R19, R11, 0x1, R8             |
 CS2R R8, SR_CLOCKLO                <----/  // time_end_warp_complete (read latency)

The time_end_block and time_end_cluster timestamps are also in the correct locations.
