The problems
When I run NVSHMEM, I hit the following problem:
I then found that if I call `cudaFree(0)` before `nvshmem_init`, I get a failure in `nvshmem_barrier_all`.
Calling other CUDA runtime APIs such as `cudaMalloc` or `cudaFuncSetAttribute` also causes the problem.
Moving `cudaFree(0)` to right after `nvshmem_init` solves the problem.
Details
NVSHMEM version: 3.2.5
NVSHMEM was built with:
export NVSHMEM_IBGDA_SUPPORT=0
export NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=0
export NVSHMEM_IBDEVX_SUPPORT=0
export NVSHMEM_IBRC_SUPPORT=1
export NVSHMEM_LIBFABRIC_SUPPORT=0
export NVSHMEM_MPI_SUPPORT=1
export NVSHMEM_USE_GDRCOPY=0
export NVSHMEM_ENABLE_ALL_DEVICE_INLINING=1
cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_CUDA_ARCHITECTURES=90 \
-DNVSHMEM_BUILD_TESTS=ON \
-DNVSHMEM_BUILD_EXAMPLES=OFF \
-DNVSHMEM_BUILD_PACKAGES=OFF
The sample code:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <nvshmem.h>
#include <nvshmemx.h>
// CUDA_CHECK(stmt): evaluates a CUDA runtime call exactly once. If the call
// does not return cudaSuccess, prints "[file:line] cuda failed with <error>"
// to stderr and terminates the process with exit(-1).
//
// The temporary uses a macro-private name so that an argument expression
// which itself mentions a variable called `result` (for example
// CUDA_CHECK(cudaMalloc(&result, n))) is not captured by the macro's own
// declaration.
#undef CUDA_CHECK
#define CUDA_CHECK(stmt)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_status_ = (stmt);                              \
        if (cudaSuccess != cuda_check_status_) {                              \
            fprintf(stderr, "[%s:%d] cuda failed with %s \n", __FILE__,       \
                    __LINE__, cudaGetErrorString(cuda_check_status_));        \
            exit(-1);                                                         \
        }                                                                     \
    } while (0)
using namespace std;
// Minimal reproducer: one CUDA runtime call issued ahead of nvshmem_init(),
// then NVSHMEM initialisation, device selection from the node-team PE index,
// a global barrier, and shutdown. Progress markers are written to stderr.
int main(int c, char *v[]) {
    // Reproducer trigger: with this CUDA runtime call placed before
    // nvshmem_init(), nvshmem_barrier_all() below was observed to fail.
    // With this line commented out, the program runs fine.
    CUDA_CHECK(cudaFree(0));

    fprintf(stderr, "nvshmem_init...\n");
    nvshmem_init();
    fprintf(stderr, "nvshmem_init done...\n");

    // Global PE index and PE count are queried but not used afterwards.
    const int my_pe = nvshmem_my_pe();
    const int pe_count = nvshmem_n_pes();
    (void)my_pe;
    (void)pe_count;

    // This PE's index within NVSHMEMX_TEAM_NODE is used as the CUDA device id.
    const int node_pe = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
    CUDA_CHECK(cudaSetDevice(node_pe));

    nvshmem_barrier_all();
    fprintf(stderr, "nvshmem_barrier_all done...\n");

    // The device is synchronised twice, exactly as in the original report.
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaDeviceSynchronize());

    nvshmem_finalize();
    return 0;
}
Compile with:
NVSHMEM_HOME=/path/to/nvshmem
nvcc -std=c++17 -O3 -arch=sm_90 --compiler-options="-O2 -pipe -march=native -Wall -fopenmp" -Xcompiler -rdynamic --generate-line-info -I${NVSHMEM_HOME}/include -L${NVSHMEM_HOME}/lib -lnvshmem -o main-nvshmem main-nvshmem.cu -L/opt/cuda/lib64 -lcublas -lnvidia-ml
Run with this command:
NVSHMEM_DEBUG=INFO NVSHMEM_DEBUG_SUBSYS=ALL NVSHMEM_DISABLE_CUDA_VMM=1 nvshmrun -ppn 2 -n 2 main-nvshmem
I got the error:
When run with this command instead (`NVSHMEM_DISABLE_CUDA_VMM=0`):
NVSHMEM_DEBUG=INFO NVSHMEM_DEBUG_SUBSYS=ALL NVSHMEM_DISABLE_CUDA_VMM=0 nvshmrun -ppn 2 -n 2 main-nvshmem
it fails in `nvshmem_init` (on H800).