Are there any NCCL packages with examples?

After doing some CUDA-related programming, I wanted to delve into NCCL programming, but the literature seems few and far between. I am starting here but cannot even get started with code snippets:

For example:
“ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
config.blocking = 0;
config.minCTAs = 4;
config.maxCTAs = 16;
config.cgaClusterSize = 2;
config.netName = "Socket";
CHECK(ncclCommInitRankConfig(&comm, nranks, id, rank, &config));
do {
CHECK(ncclCommGetAsyncError(comm, &state));
// Handle outside events, timeouts, progress, …
} while(state == ncclInProgress);”

I am unsure of what values to use for parameters for
ncclCommInitRankConfig()

I would like to see some examples like cuda rpm has plenty of.

I’m going to move this over to the Accelerated Libraries forum, where the other NCCL questions are. NCCL has some example codes listed here.

Thanks, I will try. I actually asked an AI and it came up with pretty good examples. So I installed nccl-dev and the MPI libraries, and it compiled OK; however, I am getting some runtime errors. Will provide details shortly.

It appears the AI assistant pulled up exactly the same example as the one in the link you posted. Here are my compile and run scripts:

=============
set -x

# Build-and-run helper for the NCCL/MPI example.
FILENAME=ex-1-create-comm
EXT=cpp
COMPILER=/usr/lib64/openmpi/bin/mpicxx
COMPILER=nvcc            # overrides the line above: nvcc drives the host compile
LOGDIR=log
mkdir -p $LOGDIR
rm -f $FILENAME.out

# NOTE: the trailing backslashes are required — without them the shell runs
# "$COMPILER" with no arguments and then tries to execute "-L..." as a command.
$COMPILER \
    -L/usr/lib64 -L/usr/lib64/openmpi/lib/ \
    -lmpi_cxx -lnccl -lmpi -I/usr/include/openmpi-x86_64/ \
    $FILENAME.$EXT -o $FILENAME.out 2>&1 | tee $LOGDIR/$FILENAME.compile.log

if [[ -f $FILENAME.out ]] ; then
    # The env-var assignments must be part of the SAME command as mpirun;
    # on their own line they are a no-op and never reach mpirun.
    # (Fixed: "$LODIR" typo — the run log silently went to "/..." before.)
    LD_LIBRARY_PATH=/usr/lib64/openmpi/lib/ OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 \
        mpirun -np 4 ./$FILENAME.out 2>&1 | tee $LOGDIR/$FILENAME.run.log
fi

log:

/usr/bin/ld: /tmp/tmpxft_000003b4_00000000-5_ex-1-create-comm.o: in function MPI::Intracomm::Intracomm()': ex-1-create-comm.cpp:(.text._ZN3MPI9IntracommC2Ev[_ZN3MPI9IntracommC5Ev]+0x14): undefined reference to MPI::Comm::Comm()’
/usr/bin/ld: /tmp/tmpxft_000003b4_00000000-5_ex-1-create-comm.o: in function MPI::Intracomm::Intracomm(ompi_communicator_t*)': ex-1-create-comm.cpp:(.text._ZN3MPI9IntracommC2EP19ompi_communicator_t[_ZN3MPI9IntracommC5EP19ompi_communicator_t]+0x19): undefined reference to MPI::Comm::Comm()’
/usr/bin/ld: /tmp/tmpxft_000003b4_00000000-5_ex-1-create-comm.o: in function MPI::Op::Init(void (*)(void const*, void*, int, MPI::Datatype const&), bool)': ex-1-create-comm.cpp:(.text._ZN3MPI2Op4InitEPFvPKvPviRKNS_8DatatypeEEb[_ZN3MPI2Op4InitEPFvPKvPviRKNS_8DatatypeEEb]+0x24): undefined reference to ompi_mpi_cxx_op_intercept’
/usr/bin/ld: /tmp/tmpxft_000003b4_00000000-5_ex-1-create-comm.o:(.rodata._ZTVN3MPI3WinE[_ZTVN3MPI3WinE]+0x48): undefined reference to MPI::Win::Free()' /usr/bin/ld: /tmp/tmpxft_000003b4_00000000-5_ex-1-create-comm.o:(.rodata._ZTVN3MPI8DatatypeE[_ZTVN3MPI8DatatypeE]+0x78): undefined reference to MPI::Datatype::Free()’
collect2: error: ld returned 1 exit status

That’s a link-time error. You’re missing (proper/correct) MPI C++ bindings. Yes, I can see you have -lmpi_cxx; it may be an ordering issue, or a problem with your specific MPI. I won’t be able to sort it out for you. You should make sure you are using a properly built MPI such as the one in the HPC SDK. AFAIK NCCL expects CUDA-aware MPI, with all that that implies.

It turns out I already got it working. Now I am getting truly either a CUDA or MPI issue, after resolving a few more runtime errors.
Here myRank is getting garbled right at the call to ncclCommInitRankConfig().

I instrumented with some debugging code and seeing myrank is getting updated through:
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

shortly after that, it is printing out correctly, 0 and 1 respectively for 2-GPU system:

++ tee log/ex-1-create-comm.run.log
myRank: 1.
nRanks: 2.
localRank: 1.
cudaSetDevice… with localRank: 1
myRank: 0.
nRanks: 2.
localRank: 0.
cudaSetDevice… with localRank: 0
MPI_Bcast…
MPI_Bcast…

The strange thing is that up until ncclCommInitRankConfig there is no code that updates myRank, but now, just before calling this function, one of the myRank values becomes 128:

ncclCommInitRankConfig: nRanks: 2, id: 0, myRank 0.
ncclCommInitRankConfig: nRanks: 2, id: 1, myRank 128.

On another run it was getting a number like 12342355 — a garbage, (-1)-like value.

include <stdio.h>
include <cuda_runtime.h>
include <nccl.h>
include <mpi.h>

define CHECK_CUDA(call) {
cudaError_t status = call;
if (status != cudaSuccess) {
fprintf(stderr, “CUDA error: %s\n”, cudaGetErrorString(status));
exit(1);
}
}

define CHECK_NCCL(call) {
ncclResult_t status = call;
if (status != ncclSuccess) {
fprintf(stderr, “NCCL error: %s (%d)\n”, ncclGetErrorString(status), status);
exit(1);
}
}

int main(int argc, char* argv) {
int myRank, nRanks, localRank;
ncclUniqueId id;
ncclComm_t comm;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
config.blocking = 0; // Non-blocking mode
ncclResult_t asyncState;

// Initialize MPI
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
MPI_Comm_size(MPI_COMM_WORLD, &nRanks);


printf("myRank: %u.\n", myRank);
printf("nRanks: %u.\n", nRanks);
// Simple local rank calculation (for single GPU per node)
localRank = myRank % 2;  // Adjust based on GPUs per node

// Set CUDA device

printf("localRank: %u.\n", localRank);
printf("cudaSetDevice... with localRank: %u\n", localRank);
CHECK_CUDA(cudaSetDevice(localRank));

// Generate unique ID on rank 0 and broadcast
if (myRank == 0) {
    CHECK_NCCL(ncclGetUniqueId(&id));
}
printf("MPI_Bcast...\n");
MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

// Initialize communicator with config
printf("ncclCommInitRankConfig: nRanks: %u, id: %u, myRank %u.\n", nRanks, id, myRank);
CHECK_NCCL(ncclCommInitRankConfig(&comm, nRanks, id, myRank, &config));

// Wait for initialization to complete (poll async error)
do {
    CHECK_NCCL(ncclCommGetAsyncError(comm, &asyncState));
} while (asyncState == ncclInProgress);

if (asyncState != ncclSuccess) {
    fprintf(stderr, "Initialization failed on rank %d\n", myRank);
    CHECK_NCCL(ncclCommAbort(comm));
    MPI_Finalize();
    return 1;
}

// Example: All-reduce a simple buffer (size 1 for demo)
const int dataSize = 1;
float *sendBuff, *recvBuff;
CHECK_CUDA(cudaMalloc(&sendBuff, dataSize * sizeof(float)));
CHECK_CUDA(cudaMalloc(&recvBuff, dataSize * sizeof(float)));

// Initialize send buffer with rank ID
float initValue = static_cast<float>(myRank);
CHECK_CUDA(cudaMemcpy(sendBuff, &initValue, sizeof(float), cudaMemcpyHostToDevice));

printf("ncclAllReduce\n");

// Perform all-reduce (sum)
CHECK_NCCL(ncclAllReduce(sendBuff, recvBuff, dataSize, ncclFloat, ncclSum, comm, cudaStreamDefault));

// Wait for operation to complete
CHECK_CUDA(cudaStreamSynchronize(cudaStreamDefault));  // Or poll ncclCommGetAsyncError for non-blocking

// Verify result (should be sum of ranks, e.g., 0+1=1 for 2 ranks)
float result;
CHECK_CUDA(cudaMemcpy(&result, recvBuff, sizeof(float), cudaMemcpyDeviceToHost));
printf("Rank %d: All-reduce result = %.0f (expected %d)\n", myRank, result, (nRanks * (nRanks - 1)) / 2);

// Cleanup
CHECK_CUDA(cudaFree(sendBuff));
CHECK_CUDA(cudaFree(recvBuff));
CHECK_NCCL(ncclCommDestroy(comm));
MPI_Finalize();
return 0;

}