NVSHMEM issues with synchronization

I wrote the following code:

#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <mpi.h>
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

int rank, ndevices, procs;
// int mype_node, msg;
cudaStream_t stream;

int *cpu_data_array;
int *cpu_data_array_check;
int *gpu_data_array;

constexpr int Nx{8};
constexpr int Ny{8};
constexpr int Nz{4};

void print_real_data_cpu(int *data, int x, int y, int z)
{
    for (size_t i = 0; i < x; i++)
    {
        std::cout << "\n\n";
        for (size_t j = 0; j < y; j++)
        {
            std::cout << std::endl;
            for (size_t k = 0; k < z; k++)
            {
                std::cout << " " << data[(i * y * z) + (j * z) + k];
            }
        }
    }
}

__global__ void reality_condition(int *data, int procs, int rank)
{
    int i = threadIdx.x + (blockIdx.x * blockDim.x);
    int y_current = (i % (Ny / procs));
    int y_global = (rank * (Ny / procs)) + y_current;
    int x = (i / (Ny / procs)) % Nx;
    // __syncthreads();

    if (((y_global != 0) && (x != 0) && (x < (Nx / 2)) && (y_global < (Ny / 2))))
    {
        // int temp1 = Nx - x;
        int temp2 = Ny - y_global;

        int rank_to_send = temp2 / (Ny / procs);
        int y_at_that_rank = temp2 % (Ny / procs);
        int val = data[(x * (Ny / procs) * Nz) + (y_current * Nz) + 0];
        nvshmem_int_p(&(data[((Nx - x) * (Ny / procs) * Nz) + (y_at_that_rank * Nz) + 0]), val, rank_to_send);
    }
    // __syncthreads();

}

int main()
{
    nvshmemx_init_attr_t attr;
    MPI_Comm comm = MPI_COMM_WORLD;
    attr.mpi_comm = &comm;

    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    cudaSetDevice(rank % 2);
    cudaStreamCreate(&stream);

    // initialization of nvshemms with MPI
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);

    // initialization of arrays
    cpu_data_array = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    cpu_data_array_check = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    gpu_data_array = (int *)nvshmem_malloc(sizeof(int) * Nx * (Ny / procs) * Nz);

    for (size_t i = 0; i < Nx; i++)
    {
        for (size_t j = 0, y = rank * (Ny / procs); j < (Ny / procs), y < ((rank + 1) * (Ny / procs)); j++, y++)
        {
            for (size_t k = 0; k < Nz; k++)
            {
                cpu_data_array[(i * (Ny / procs) * Nz) + (j * Nz) + k] = (i * Ny * Nz) + (y * Nz) + k;
            }
        }
    }
    // if (rank == 1)
    //     print_real_data_cpu(cpu_data_array, Nx, Ny / procs, Nz);

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);
    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();

    cudaMemcpy(cpu_data_array_check, gpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    nvshmem_free(gpu_data_array);
    nvshmem_finalize();

    if (rank == 1)
        print_real_data_cpu(cpu_data_array_check, Nx, Ny / procs, Nz);
    MPI_Finalize();

    return 0;

}

When I ran it, sometimes it gave the right output and sometimes it gave the wrong output, and I don't know why.
Can someone tell me what is going wrong? As far as I can see, I have taken care of all the synchronization in the code.

When posting code on these forums, please format the code correctly. A simple method is to edit your post by clicking on the pencil icon below it, select the code, click the </> button at the top of the edit window, and then save your changes.

Please do that now.

I also suggest providing the actual (problematic) output as well as the expected output. Finally, it's often a good idea to indicate which CUDA version and NVSHMEM version you are using, what the compile command is, and which GPU you are running on.

Sorry, I was not able to post the code correctly.
Here is the corrected code:

#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <mpi.h>
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

int rank, ndevices, procs;
// int mype_node, msg;
cudaStream_t stream;

int *cpu_data_array;
int *cpu_data_array_check;
int *gpu_data_array;

constexpr int Nx{8};
constexpr int Ny{8};
constexpr int Nz{4};

void print_real_data_cpu(int *data, int x, int y, int z)
{
    for (size_t i = 0; i < x; i++)
    {
        std::cout << "\n\n";
        for (size_t j = 0; j < y; j++)
        {
            std::cout << std::endl;
            for (size_t k = 0; k < z; k++)
            {
                std::cout << "  " << data[(i * y * z) + (j * z) + k];
            }
        }
    }
}

__global__ void reality_condition(int *data, int procs, int rank)
{
    int i = threadIdx.x + (blockIdx.x * blockDim.x);
    int y_current = (i % (Ny / procs));
    int y_global = (rank * (Ny / procs)) + y_current;
    int x = (i / (Ny / procs)) % Nx;
    // __syncthreads();

    if (((y_global != 0) && (x != 0) && (x < (Nx / 2)) && (y_global < (Ny / 2))))
    {
        // int temp1 = Nx - x;
        int temp2 = Ny - y_global;

        int rank_to_send = temp2 / (Ny / procs);
        int y_at_that_rank = temp2 % (Ny / procs);
        int val = -data[(x * (Ny / procs) * Nz) + (y_current * Nz) + 0];
        nvshmem_int_p(&(data[((Nx - x) * (Ny / procs) * Nz) + (y_at_that_rank * Nz) + 0]), val, rank_to_send);

    }
    // __syncthreads();
}

int main()
{
    nvshmemx_init_attr_t attr;
    MPI_Comm comm = MPI_COMM_WORLD;
    attr.mpi_comm = &comm;

    MPI_Init(nullptr, nullptr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &procs);

    cudaSetDevice(rank % 2);
    cudaStreamCreate(&stream);

    // initialization of nvshemms with MPI
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);

    // initialization of arrays
    cpu_data_array = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    cpu_data_array_check = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz);
    gpu_data_array = (int *)nvshmem_malloc(sizeof(int) * Nx * (Ny / procs) * Nz);

    for (size_t i = 0; i < Nx; i++)
    {
        for (size_t j = 0, y = rank * (Ny / procs); j < (Ny / procs), y < ((rank + 1) * (Ny / procs)); j++, y++)
        {
            for (size_t k = 0; k < Nz; k++)
            {
                cpu_data_array[(i * (Ny / procs) * Nz) + (j * Nz) + k] = (i * Ny * Nz) + (y * Nz) + k;
            }
        }
    }
    // if (rank == 1)
    //     print_real_data_cpu(cpu_data_array, Nx, Ny / procs, Nz);

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);
    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();

    cudaMemcpy(cpu_data_array_check, gpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    nvshmem_free(gpu_data_array);
    nvshmem_finalize();

    if (rank == 1)
        print_real_data_cpu(cpu_data_array_check, Nx, Ny / procs, Nz);
    MPI_Finalize();

    return 0;
}

Sometimes it gives the correct output and sometimes it gives the wrong output. I don't know what the issue is here.

I compiled it with the following command:

nvcc -std=c++14 -arch=sm_80 -rdc=true reality.cu -I /home/manver/hpc_sdk/Linux_x86_64/22.3/comm_libs/nvshmem/include,/usr/local/include -L /home/manver/hpc_sdk/Linux_x86_64/22.3/comm_libs/nvshmem/lib,/usr/local/lib -lcuda -lcudart -lnvshmem -lmpi -lnvidia-ml -o reality

The correct output is:

 16  17  18  19
  20  21  22  23
  24  25  26  27
  28  29  30  31


  48  49  50  51
  52  53  54  55
  56  57  58  59
  60  61  62  63


  80  81  82  83
  84  85  86  87
  88  89  90  91
  92  93  94  95


  112  113  114  115
  116  117  118  119
  120  121  122  123
  124  125  126  127


  144  145  146  147
  148  149  150  151
  152  153  154  155
  156  157  158  159


  176  177  178  179
  -108  181  182  183
  -104  185  186  187
  -100  189  190  191


  208  209  210  211
  -76  213  214  215
  -72  217  218  219
  -68  221  222  223


  240  241  242  243
  -44  245  246  247
  -40  249  250  251
  -36  253  254  255

The output I get sometimes is:

16  17  18  19
  20  21  22  23
  24  25  26  27
  28  29  30  31


  48  49  50  51
  52  53  54  55
  56  57  58  59
  60  61  62  63


  80  81  82  83
  84  85  86  87
  88  89  90  91
  92  93  94  95


  112  113  114  115
  116  117  118  119
  120  121  122  123
  124  125  126  127


  144  145  146  147
  148  149  150  151
  152  153  154  155
  156  157  158  159


  176  177  178  179
  180  181  182  183
  184  185  186  187
  188  189  190  191


  208  209  210  211
  212  213  214  215
  216  217  218  219
  220  221  222  223


  240  241  242  243
  244  245  246  247
  248  249  250  251
  252  253  254  255

I'm using CUDA version 11.6.
I have run this code on an NVIDIA A100 and also tried it on TITAN X GPUs; in both cases the behavior is the same.

One issue I see with the code is that you do the cudaMemcpyAsync immediately followed by the reality_condition kernel on the same stream.
This can lead to a race condition, because both are updating the same GPU data array.

That is, while a GPU's cudaMemcpyAsync is still in flight, that GPU can already start receiving the nvshmem_int_p data from the other GPUs, and the received values can then be overwritten by the copy. Therefore, before the reality_condition kernel you need an nvshmemx_barrier_all_on_stream call.
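
In other words, on each PE the stream should be ordered roughly like the sketch below (this reuses the stream and array names from your posted code; the extra barrier before the kernel launch is the only change):

    cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream);

    // Wait until every PE has finished its host-to-device copy before any PE
    // starts writing into remote symmetric memory with nvshmem_int_p.
    nvshmemx_barrier_all_on_stream(stream);

    reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank);

    // Existing barrier: wait until all remote puts have landed before copying back.
    nvshmemx_barrier_all_on_stream(stream);
    cudaDeviceSynchronize();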

Thanks, the issue is now resolved.
