I am running this in a Docker container on Linux. I wrote a demo to test:
#include <iostream>
#include <stdio.h>
#include <ctime>
#include <algorithm>
#include <mpi.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <cublas_v2.h>
#include <cublas_api.h>
#include <cudaProfiler.h>
#include "cublas_utils.h"

using nidType = int;
using namespace std;

int main(int argc, char* argv[]) {
    cudaStream_t stream;
    nvshmemx_init_attr_t attr;
    int rank, nranks;
    MPI_Comm mpi_comm = MPI_COMM_WORLD;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    attr.mpi_comm = &mpi_comm;

    // Set up the NVSHMEM device: one PE per GPU on the node.
    nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
    int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
    cudaSetDevice(mype_node);
    cudaStreamCreate(&stream);

    int hidden_dim = 16;
    int dim = 602;
    int num_nodes = 10000000;
    int ldx, ldw, ldout;
    float *d_W, *d_out;
    float alpha, beta;
    cublasOperation_t transa, transb;
    cublasHandle_t cublasH;

    alpha = 1.0f;
    beta = 0.0f;
    transa = CUBLAS_OP_N;
    transb = CUBLAS_OP_N;
    cublasH = NULL;
    CUBLAS_CHECK(cublasCreate(&cublasH));

    int max_dim = max(dim, hidden_dim);
    // d_W = (float *) nvshmem_malloc(k * m * sizeof(float));
    // d_W:        hidden_dim x dim
    // d_out as B: dim x num_nodes
    // d_out as C: hidden_dim x num_nodes
    CUDA_CHECK(cudaMalloc((void **)&d_W, dim * hidden_dim * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_W, 0, dim * hidden_dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_out, num_nodes * max_dim * sizeof(float)));
    CUDA_CHECK(cudaMemset(d_out, 0, num_nodes * max_dim * sizeof(float)));

    ldx = dim, ldw = hidden_dim, ldout = hidden_dim;
    MPI_Barrier(MPI_COMM_WORLD);
    CUBLAS_CHECK(cublasSgemm(cublasH, transa, transb, hidden_dim, num_nodes, dim,
                             &alpha, d_W, ldw, d_out, ldx, &beta,
                             d_out, ldout));

    cudaFree(d_W);
    cudaFree(d_out);
    nvshmem_finalize();
    MPI_Finalize();
    return 0;
}
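The source is /pipegnn/src/mgg_test.cu (the path in the error message below), compiled with nvcc and linked against cuBLAS, MPI, and NVSHMEM; the exact flags depend on the image. I launch it with two ranks, matching the two PIDs in the log:

mpirun -np 2 build1/mggTest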
It is a demo that implements d_out = d_W * d_out, like a forward pass in deep learning; the cublasSgemm call computes C = alpha * op(A) * op(B) + beta * C with m = hidden_dim, n = num_nodes, k = dim. I change num_nodes to change the size of d_out.
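For scale, this is the d_out footprint the code is meant to request at each setting (a separate back-of-envelope check, computing the byte count in 64 bits; d_out_bytes is my own name and is not in the demo):

// Intended d_out size: num_nodes * max_dim floats, max_dim = max(602, 16) = 602.
// num_nodes = 2500:     2500ULL * 602 * 4 bytes     ~  6.0 MB
// num_nodes = 10000000: 10000000ULL * 602 * 4 bytes ~ 24.1 GB
size_t d_out_bytes = (size_t)num_nodes * (size_t)max_dim * sizeof(float);
printf("requesting %zu bytes for d_out\n", d_out_bytes);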
When I set num_nodes to 2500, it runs correctly. However, when I change num_nodes to 10000000, it raises cublas error 13 (CUBLAS_STATUS_EXECUTION_FAILED):
cublas error 13 at /pipegnn/src/mgg_test.cu:65
terminate called after throwing an instance of 'std::runtime_error'
what(): cublas error
[powerleader:04808] *** Process received signal ***
[powerleader:04808] Signal: Aborted (6)
[powerleader:04808] Signal code: (-6)
[powerleader:04808] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x14420)[0x7fdda43a1420]
[powerleader:04808] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb)[0x7fdda3e8a00b]
[powerleader:04808] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x12b)[0x7fdda3e69859]
[powerleader:04808] [ 3] /lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e8d1)[0x7fdda42438d1]
[powerleader:04808] [ 4] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa37c)[0x7fdda424f37c]
[powerleader:04808] [ 5] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa3e7)[0x7fdda424f3e7]
[powerleader:04808] [ 6] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa699)[0x7fdda424f699]
[powerleader:04808] [ 7] build1/mggTest(+0x1913d)[0x56186765a13d]
[powerleader:04808] [ 8] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7fdda3e6b083]
[powerleader:04808] [ 9] build1/mggTest(+0x1834e)[0x56186765934e]
[powerleader:04808] *** End of error message ***
cublas error 13 at /pipegnn/src/mgg_test.cu:65
terminate called after throwing an instance of 'std::runtime_error'
what(): cublas error
[powerleader:04807] *** Process received signal ***
[powerleader:04807] Signal: Aborted (6)
[powerleader:04807] Signal code: (-6)
[powerleader:04807] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x14420)[0x7f286ae05420]
[powerleader:04807] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb)[0x7f286a8ee00b]
[powerleader:04807] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x12b)[0x7f286a8cd859]
[powerleader:04807] [ 3] /lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e8d1)[0x7f286aca78d1]
[powerleader:04807] [ 4] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa37c)[0x7f286acb337c]
[powerleader:04807] [ 5] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa3e7)[0x7f286acb33e7]
[powerleader:04807] [ 6] /lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa699)[0x7f286acb3699]
[powerleader:04807] [ 7] build1/mggTest(+0x1913d)[0x55881b62e13d]
[powerleader:04807] [ 8] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7f286a8cf083]
[powerleader:04807] [ 9] build1/mggTest(+0x1834e)[0x55881b62d34e]
[powerleader:04807] *** End of error message ***
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 0 on node powerleader exited on signal 6 (Aborted)
I ran it on an A800 80GB and an A100 80GB, with no other programs using the GPUs, and I watched nvidia-smi while it ran. I noticed that the maximum GPU memory used was only 7449 MB.
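To cross-check what nvidia-smi shows, the two cudaMalloc calls can be bracketed with cudaMemGetInfo (a sketch I used as a diagnostic; the variable names are mine):

// Sketch: measure how much device memory the two allocations actually consume.
size_t free_before, free_after, total;
CUDA_CHECK(cudaMemGetInfo(&free_before, &total));
// ... the two cudaMalloc calls from the demo ...
CUDA_CHECK(cudaMemGetInfo(&free_after, &total));
printf("rank %d: allocations consumed %.1f MiB\n", rank,
       (free_before - free_after) / (1024.0 * 1024.0));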
What is the reason?