I wrote a code::
include <cuda.h>
include <nvshmem.h>
include <nvshmemx.h>
include <mpi.h>
include
include <cuda_runtime.h>
include <device_launch_parameters.h>int rank, ndevices, procs;
// int mype_node, msg;
cudaStream_t stream;int *cpu_data_array;
int *cpu_data_array_check;
int *gpu_data_array;constexpr int Nx{8};
constexpr int Ny{8};
constexpr int Nz{4};void print_real_data_cpu(int *data, int x, int y, int z)
{
for (size_t i = 0; i < x; i++)
{
std::cout << “\n\n”;
for (size_t j = 0; j < y; j++)
{
std::cout << std::endl;
for (size_t k = 0; k < z; k++)
{
std::cout << " " << data[(i * y * z) + (j * z) + k];
}
}
}
}global void reality_condition(int *data, int procs, int rank)
{
int i = threadIdx.x + (blockIdx.x * blockDim.x);
int y_current = (i % (Ny / procs));
int y_global = (rank * (Ny / procs)) + y_current;
int x = (i / (Ny / procs)) % Nx;
// __syncthreads();if (((y_global != 0) && (x != 0) && (x < (Nx / 2)) && (y_global < (Ny / 2)))) { // int temp1 = Nx - x; int temp2 = Ny - y_global; int rank_to_send = temp2 / (Ny / procs); int y_at_that_rank = temp2 % (Ny / procs); int val = data[(x * (Ny / procs) * Nz) + (y_current * Nz) + 0]; nvshmem_int_p(&(data[((Nx - x) * (Ny / procs) * Nz) + (y_at_that_rank * Nz) + 0]), val, rank_to_send); } // __syncthreads();}
int main()
{
nvshmemx_init_attr_t attr;
MPI_Comm comm = MPI_COMM_WORLD;
attr.mpi_comm = &comm;MPI_Init(nullptr, nullptr); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &procs); cudaSetDevice(rank % 2); cudaStreamCreate(&stream); // initialization of nvshemms with MPI nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); // initialization of arrays cpu_data_array = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz); cpu_data_array_check = (int *)malloc(sizeof(int) * Nx * (Ny / procs) * Nz); gpu_data_array = (int *)nvshmem_malloc(sizeof(int) * Nx * (Ny / procs) * Nz); for (size_t i = 0; i < Nx; i++) { for (size_t j = 0, y = rank * (Ny / procs); j < (Ny / procs), y < ((rank + 1) * (Ny / procs)); j++, y++) { for (size_t k = 0; k < Nz; k++) { cpu_data_array[(i * (Ny / procs) * Nz) + (j * Nz) + k] = (i * Ny * Nz) + (y * Nz) + k; } } } // if (rank == 1) // print_real_data_cpu(cpu_data_array, Nx, Ny / procs, Nz); cudaMemcpyAsync(gpu_data_array, cpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyHostToDevice, stream); reality_condition<<<Nx, (Ny / procs), 0, stream>>>(gpu_data_array, procs, rank); nvshmemx_barrier_all_on_stream(stream); cudaDeviceSynchronize(); cudaMemcpy(cpu_data_array_check, gpu_data_array, sizeof(int) * Nx * (Ny / procs) * Nz, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); nvshmem_free(gpu_data_array); nvshmem_finalize(); if (rank == 1) print_real_data_cpu(cpu_data_array_check, Nx, Ny / procs, Nz); MPI_Finalize(); return 0;}
when i ran it . sometimes it gives right output sometimes it gives wrong output. I don’t know why??
Can someone tell me , since i have taken care of all synchronization in code as you can see.