NVSHMEM issues with synchronization

I wrote the following code:

#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <mpi.h>
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

int rank, ndevices, procs;
// int mype_node, msg;
cudaStream_t stream;

int *cpu_data_array;
int *cpu_data_array_check;
int *gpu_data_array;

constexpr int Nx{8};
constexpr int Ny{8};
constexpr int Nz{4};

void print_real_data_cpu(int *data, int x, int y, int z)
{
    for (size_t i = 0; i < x; i++)
    {
        std::cout << "\n\n";
        for (size_t j = 0; j < y; j++)
        {
            std::cout << std::endl;
            for (size_t k = 0; k < z; k++)
            {
                std::cout << " " << data[(i * y * z) + (j * z) + k];
            }
        }
    }
}

__global__ void reality_condition(int *data, int procs, int rank)
{
    int i = threadIdx.x + (blockIdx.x * blockDim.x);
    int y_current = (i % (Ny / procs));
    int y_global = (rank * (Ny / procs)) + y_current;
    int x = (i / (Ny / procs)) % Nx;
    // __syncthreads();

    if (((y_global != 0) && (x != 0) && (x < (Nx / 2)) && (y_global < (Ny / 2))))
    {
        // int temp1 = Nx - x;
        int temp2 = Ny - y_global;

        int rank_to_send = temp2 / (Ny / procs);
        int y_at_that_rank = temp2 % (Ny / procs);
        int val = data[(x * (Ny / procs) * Nz) + (y_current * Nz) + 0];
        nvshmem_int_p(&(data[((Nx - x) * (Ny / procs) * Nz) + (y_at_that_rank * Nz) + 0]), val, rank_to_send);
    }
    // __syncthreads();

}

int main()
{
    nvshmemx_init_attr_t attr;
    MPI_Comm comm = MPI_COMM_WORLD;
    attr.mpi_comm = &comm;

    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    cudaSetDevice(rank % 2);
    cudaStreamCreate(&stream);

    // initialization of nvshemms with MPI
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);

    // initialization of arrays
    cpu_data_array = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    cpu_data_array_check = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    gpu_data_array = (int *)nvshmem_malloc(sizeof(int) * Nx * (Ny / procs) * Nz);

    for (size_t i = 0; i < Nx; i++)
    {
        for (size_t j = 0, y = rank * (Ny / procs); j < (Ny / procs), y < ((rank + 1) * (Ny / procs)); j++, y++)
        {
            for (size_t k = 0; k < Nz; k++)
            {
                cpu_data_array[(i * (Ny / procs) * Nz) + (j * Nz) + k] = (i * Ny * Nz) + (y * Nz) + k;
            }
        }
    }
    // if (rank == 1)
    //     print_real_data_cpu(cpu_data_array, Nx, Ny / procs, Nz);

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);
    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();

    cudaMemcpy(cpu_data_array_check, gpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    nvshmem_free(gpu_data_array);
    nvshmem_finalize();

    if (rank == 1)
        print_real_data_cpu(cpu_data_array_check, Nx, Ny / procs, Nz);
    MPI_Finalize();

    return 0;

}

When I ran it, sometimes it gave the right output and sometimes it gave the wrong output, and I don't know why.
Can someone tell me what is going wrong? As far as I can see, I have taken care of all the synchronization in the code.

When posting code on these forums, please format the code correctly. A simple method is to edit your post by clicking on the pencil icon below it, select the code, click the </> button at the top of the edit window, and then save your changes.

Please do that now.

I also suggest providing the actual (problematic) output as well as the expected output. Finally, it's often a good idea to indicate which CUDA version and NVSHMEM version you are using, what the compile command is, and which GPU you are running on.

Sorry, I was not able to post the code correctly.
Here is the corrected code:

#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <mpi.h>
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

int rank, ndevices, procs;
// int mype_node, msg;
cudaStream_t stream;

int *cpu_data_array;
int *cpu_data_array_check;
int *gpu_data_array;

constexpr int Nx{8};
constexpr int Ny{8};
constexpr int Nz{4};

void print_real_data_cpu(int *data, int x, int y, int z)
{
    for (size_t i = 0; i < x; i++)
    {
        std::cout << "\n\n";
        for (size_t j = 0; j < y; j++)
        {
            std::cout << std::endl;
            for (size_t k = 0; k < z; k++)
            {
                std::cout << "  " << data[(i * y * z) + (j * z) + k];
            }
        }
    }
}

__global__ void reality_condition(int *data, int procs, int rank)
{
    int i = threadIdx.x + (blockIdx.x * blockDim.x);
    int y_current = (i % (Ny / procs));
    int y_global = (rank * (Ny / procs)) + y_current;
    int x = (i / (Ny / procs)) % Nx;
    // __syncthreads();

    if (((y_global != 0) && (x != 0) && (x < (Nx / 2)) && (y_global < (Ny / 2))))
    {
        // int temp1 = Nx - x;
        int temp2 = Ny - y_global;

        int rank_to_send = temp2 / (Ny / procs);
        int y_at_that_rank = temp2 % (Ny / procs);
        int val = -data[(x * (Ny / procs) * Nz) + (y_current * Nz) + 0];
        nvshmem_int_p(&(data[((Nx - x) * (Ny / procs) * Nz) + (y_at_that_rank * Nz) + 0]), val, rank_to_send);

    }
    // __syncthreads();
}

int main()
{
    nvshmemx_init_attr_t attr;
    MPI_Comm comm = MPI_COMM_WORLD;
    attr.mpi_comm = &comm;

    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    cudaSetDevice(rank % 2);
    cudaStreamCreate(&stream);

    // initialization of nvshemms with MPI
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);

    // initialization of arrays
    cpu_data_array = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    cpu_data_array_check = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    gpu_data_array = (int *)nvshmem_malloc(sizeof(int) * Nx * (Ny / procs) * Nz);

    for (size_t i = 0; i < Nx; i++)
    {
        for (size_t j = 0, y = rank * (Ny / procs); j < (Ny / procs), y < ((rank + 1) * (Ny / procs)); j++, y++)
        {
            for (size_t k = 0; k < Nz; k++)
            {
                cpu_data_array[(i * (Ny / procs) * Nz) + (j * Nz) + k] = (i * Ny * Nz) + (y * Nz) + k;
            }
        }
    }
    // if (rank == 1)
    //     print_real_data_cpu(cpu_data_array, Nx, Ny / procs, Nz);

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);
    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();

    cudaMemcpy(cpu_data_array_check, gpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    nvshmem_free(gpu_data_array);
    nvshmem_finalize();

    if (rank == 1)
        print_real_data_cpu(cpu_data_array_check, Nx, Ny / procs, Nz);
    MPI_Finalize();

    return 0;
}

Sometimes it gives the correct output and sometimes it gives the wrong output. I don't know what the issue is here.

I compiled it with the following command:

nvcc -std=c++14 -arch=sm_80 -rdc=true reality.cu -I /home/manver/hpc_sdk/Linux_x86_64/22.3/comm_libs/nvshmem/include,/usr/local/include -L /home/manver/hpc_sdk/Linux_x86_64/22.3/comm_libs/nvshmem/lib,/usr/local/lib -lcuda -lcudart -lnvshmem -lmpi -lnvidia-ml -o reality

The correct output is:

 16  17  18  19
  20  21  22  23
  24  25  26  27
  28  29  30  31


  48  49  50  51
  52  53  54  55
  56  57  58  59
  60  61  62  63


  80  81  82  83
  84  85  86  87
  88  89  90  91
  92  93  94  95


  112  113  114  115
  116  117  118  119
  120  121  122  123
  124  125  126  127


  144  145  146  147
  148  149  150  151
  152  153  154  155
  156  157  158  159


  176  177  178  179
  -108  181  182  183
  -104  185  186  187
  -100  189  190  191


  208  209  210  211
  -76  213  214  215
  -72  217  218  219
  -68  221  222  223


  240  241  242  243
  -44  245  246  247
  -40  249  250  251
  -36  253  254  255

The output I get sometimes is:

16  17  18  19
  20  21  22  23
  24  25  26  27
  28  29  30  31


  48  49  50  51
  52  53  54  55
  56  57  58  59
  60  61  62  63


  80  81  82  83
  84  85  86  87
  88  89  90  91
  92  93  94  95


  112  113  114  115
  116  117  118  119
  120  121  122  123
  124  125  126  127


  144  145  146  147
  148  149  150  151
  152  153  154  155
  156  157  158  159


  176  177  178  179
  180  181  182  183
  184  185  186  187
  188  189  190  191


  208  209  210  211
  212  213  214  215
  216  217  218  219
  220  221  222  223


  240  241  242  243
  244  245  246  247
  248  249  250  251
  252  253  254  255

I'm using CUDA version 11.6.
I have run this code on an NVIDIA A100 and also tried it on TITAN X GPUs; in both cases the behavior is the same.

One issue I see with the code is that you do the cudaMemcpyAsync immediately followed by the reality_condition kernel on the same stream.
This can lead to a race condition, because both are updating the same GPU data array.

That is, while a GPU's cudaMemcpyAsync is still in flight, that GPU can already start receiving the nvshmem_int_p data from the other GPUs, and the received values can then be overwritten by the copy. Therefore, before the reality_condition kernel you need an nvshmemx_barrier_all_on_stream call.
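
In other words, on each PE the stream should be ordered roughly like the sketch below (this reuses the stream and array names from your posted code; the extra barrier before the kernel launch is the only change):

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);

    // Wait until every PE has finished its host-to-device copy before any PE
    // starts writing into remote symmetric memory with nvshmem_int_p.
    nvshmemx_barrier_all_on_stream(stream);

    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);

    // Existing barrier: wait until all remote puts have landed before copying back.
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();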

Thanks, the issue is now resolved.
