Thread block clusters and distributed shared memory not working as intended

I have written a simple CUDA program to perform array reduction using thread block clusters and distributed shared memory. I am compiling it with CUDA 12.0 and running on a Hopper GPU. Below is the code I use:

#include <stdio.h>
#include <cooperative_groups.h>

#define CLUSTER_SIZE 4
#define BLOCK_SIZE 32

namespace cg = cooperative_groups;

__global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1) 
cluster_reduce_sum(int n, float *arr, float *sum) 
{
    __shared__ float shared_mem[BLOCK_SIZE];
    __shared__ float cluster_sum;

    cluster_sum = 0.0f;

    cg::cluster_group cluster = cg::this_cluster();
    unsigned int cluster_block_rank = cluster.block_rank();
    unsigned int cluster_size = cluster.dim_blocks().x;
    
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;


    shared_mem[threadIdx.x] = 0.0f;
    if (idx < n) {
        shared_mem[threadIdx.x] = arr[idx];
    }

    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
        if (threadIdx.x < offset) {
            shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
        }
        __syncthreads();
    }

    cluster.sync();

    if (threadIdx.x == 0) {
        atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
    }

    cluster.sync();

    if (threadIdx.x == 0 && cluster_block_rank == 0) {
        atomicAdd(sum, cluster_sum);
    }

    cluster.sync();
}

int main(int argc, char* argv[]) {
    int n = 128;

    if (argc > 1) {
        n = atoi(argv[1]);
    }

    float *h_arr, *h_sum, sum;
    h_arr = (float*) malloc(n * sizeof(float));
    h_sum = (float*) malloc(sizeof(float));

    int upper = 1024, lower = -1024;

    sum = 0.0f;
    for(int i = 0; i < n; i++)
    {
        h_arr[i] = 1;
        sum += h_arr[i];
    }

    float *d_arr, *d_sum;
    cudaMalloc(&d_arr, n * sizeof(float));
    cudaMalloc(&d_sum, sizeof(float));

    cudaMemcpy(d_arr, h_arr, n * sizeof(float), cudaMemcpyHostToDevice);

    int num_clusters = ceil ((float)n / (CLUSTER_SIZE * BLOCK_SIZE)) + 1;
    
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);

    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        return -1;
    }

    cudaMemcpy(h_sum, d_sum, sizeof(float), cudaMemcpyDeviceToHost);

    if (*h_sum != sum) {
        printf("Kernel incorrect: %f vs %f\n", sum, *h_sum);
    }

    return 0;
}

The outputs of the kernel and the CPU do not match, and as far as I can tell there don't seem to be any bugs in my code. Please help me find the issue with this code. Thanks!

What do you mean by “outputs do not match”? What are the actual values for the CPU and the GPU?
Floating point math is not associative. Results may not be identical between a parallel version and a serial version.
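
For illustration, here is a minimal host-only sketch of that non-associativity (not code from this thread):

#include <stdio.h>

int main() {
    // Mixing large and small magnitudes: the grouping changes the float result.
    float a = 1e8f, b = -1e8f, c = 1.0f;
    float r1 = (a + b) + c;  // (0.0f) + 1.0f          -> 1.0f
    float r2 = a + (b + c);  // b + c rounds to -1e8f  -> 0.0f
    printf("r1 = %f, r2 = %f\n", r1, r2);
    return 0;
}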

Does it work when you use integers instead of floats?

Hi, thanks for your reply. I am creating an array of ones of size n = 128 and computing the sum of this array on both the CPU and the GPU. On the CPU side I get the correct sum value (sum = 128), but the result from the GPU is much less than that: the GPU output is 64. I don’t think such a big difference can come from floating-point error, since the actual computation is a simple sum. Let me know if this clarifies your questions.

That’s right, that difference should not happen in your case. I do not have a Hopper GPU available, so I can only give some general suggestions.

Use in-kernel printf to check that the per-block and per-cluster sums are as expected.
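
For example, debug prints could be placed like this (a sketch reusing the variable names from the posted kernel, not code from the original thread):

    // After the in-block reduction loop: one thread per block reports its partial sum.
    if (threadIdx.x == 0) {
        printf("block %d (cluster rank %u): block sum = %f\n",
               blockIdx.x, cluster_block_rank, shared_mem[0]);
    }

    // After the cluster-wide atomicAdd and the following cluster.sync():
    if (threadIdx.x == 0 && cluster_block_rank == 0) {
        printf("cluster sum = %f\n", cluster_sum);
    }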

The output buffer d_sum is uninitialized before the kernel, but the kernel requires it to be 0 to produce the correct result.
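
For example (a host-side sketch using the variable names from the posted main()):

    cudaMemset(d_sum, 0, sizeof(float));  // zero the accumulator before the launch
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);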

Thanks for the suggestions. I added a print statement right before we add the per-block sum (shared_mem[0]) to cluster_sum, and the per-block sums are as expected. But when I also print the final cluster sum right before we add it to the total sum, it is less than expected. So I guess something goes wrong when the cluster sum is accumulated from the local block sums. It could either be a synchronisation issue (cluster.sync()) or an issue with cluster.map_shared_rank.

I also added a cudaMemset to initialise the sum variable to 0, but it doesn’t change the code’s output. Let me know if you have any further ideas.


Any update? I can reproduce the issue. I added a printf right after
“atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);”
and found that the atomicAdd through cluster.map_shared_rank(&cluster_sum, 0) does not produce the expected value.

Thanks!

Please retest with a proper install of CUDA 12.3 (or later) on a Hopper H100. There was evidently an issue/bug/defect with atomics on distributed shared memory for the float and double types prior to CUDA 12.3. As an alternative, to verify the above code on CUDA 12.0, the code behaves correctly if the data type is changed from float to e.g. int.

Example on CUDA 12.0:

$ cat t21.cu
#include <stdio.h>
#include <cooperative_groups.h>

#define CLUSTER_SIZE 4
#define BLOCK_SIZE 32

namespace cg = cooperative_groups;
using mt=int;
__global__ void __cluster_dims__(CLUSTER_SIZE, 1, 1)
cluster_reduce_sum(int n, mt *arr, mt *sum)
{
    __shared__ mt shared_mem[BLOCK_SIZE];
    __shared__ mt cluster_sum;

    cluster_sum = 0;

    cg::cluster_group cluster = cg::this_cluster();
    unsigned int cluster_block_rank = cluster.block_rank();
    unsigned int cluster_size = cluster.dim_blocks().x;

    const int idx = blockIdx.x * blockDim.x + threadIdx.x;


    shared_mem[threadIdx.x] = 0;
    if (idx < n) {
        shared_mem[threadIdx.x] = arr[idx];
    }

    __syncthreads();

    for (int offset = BLOCK_SIZE / 2; offset; offset /= 2) {
        if (threadIdx.x < offset) {
            shared_mem[threadIdx.x] += shared_mem[threadIdx.x + offset];
        }
        __syncthreads();
    }

    cluster.sync();

    if (threadIdx.x == 0) {
        atomicAdd(cluster.map_shared_rank(&cluster_sum, 0), shared_mem[0]);
    }

    cluster.sync();

    if (threadIdx.x == 0 && cluster_block_rank == 0) {
        atomicAdd(sum, cluster_sum);
    }

    cluster.sync();
}

int main(int argc, char* argv[]) {
    int n = 128;

    if (argc > 1) {
        n = atoi(argv[1]);
    }

    mt *h_arr, *h_sum, sum;
    h_arr = (mt*) malloc(n * sizeof(mt));
    h_sum = (mt*) malloc(sizeof(mt));

    //int upper = 1024, lower = -1024;

    sum = 0;
    for(int i = 0; i < n; i++)
    {
        h_arr[i] = 1;
        sum += h_arr[i];
    }

    mt *d_arr, *d_sum;
    cudaMalloc(&d_arr, n * sizeof(mt));
    cudaMalloc(&d_sum, sizeof(mt));

    cudaMemcpy(d_arr, h_arr, n * sizeof(mt), cudaMemcpyHostToDevice);

    //int num_clusters = ceil ((float)n / (CLUSTER_SIZE * BLOCK_SIZE)) + 1;
    int num_clusters = 1;
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);

    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        return -1;
    }

    cudaMemcpy(h_sum, d_sum, sizeof(mt), cudaMemcpyDeviceToHost);

    if (*h_sum != sum) {
        printf("Kernel incorrect: %f vs %f\n", (float)sum, (float)(*h_sum));
    }

    return 0;
}
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Fri_Jan__6_16:45:21_PST_2023
Cuda compilation tools, release 12.0, V12.0.140
Build cuda_12.0.r12.0/compiler.32267302_0
$ nvcc -o t21 t21.cu -arch=sm_90
$ compute-sanitizer ./t21
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

(4112521)

(The num_clusters calculation was also incorrect, but that did not materially affect the observation.)
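
For reference, a corrected grid-size calculation could look like this (a sketch based on the posted code, not from the original thread):

    // Each cluster covers CLUSTER_SIZE * BLOCK_SIZE elements; round up, without the extra +1.
    int elems_per_cluster = CLUSTER_SIZE * BLOCK_SIZE;
    int num_clusters = (n + elems_per_cluster - 1) / elems_per_cluster;
    cluster_reduce_sum <<< CLUSTER_SIZE * num_clusters, BLOCK_SIZE >>> (n, d_arr, d_sum);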
