Absolutely… though it’s written to test the DGX-2 / HGX-2 - so I doubt many people out there have the hardware capable of debugging it! :(

The code is trying to find a faster way to run sparse matrix multiplies (in some scenarios) instead of csrmm / cscmm, by instead defaulting to ELLPACK (using padding for non-conforming matrices), which benefits hugely from coalesced memory reads - as each ‘row’ of non zeroes is a multiple of 32

```
#include <string>
#include <iostream>
#include <chrono>
#include <stdio.h>
#include <stdexcept>
#include <random>
#include <iterator>
#include <fstream>
#include <cublas_v2.h>
#ifndef CUDA
#define CUDA(expr) do { \
cudaError_t _e = (expr); \
if (_e == cudaSuccess) break; \
char errstr[128]; \
snprintf(errstr, 128, \
"%s(%d) CUDA Error(%d)\n", \
__FILE__, __LINE__, _e); \
throw std::runtime_error(errstr); \
} while (0)
#endif
#define CUSPARSE(expr) do { \
cusparseStatus_t _e = (expr); \
if (_e == CUSPARSE_STATUS_SUCCESS) break; \
char errstr[128]; \
snprintf(errstr, 128, \
"%s(%d) CUSPARSE Error(%d)\n", \
__FILE__, __LINE__, _e); \
throw std::runtime_error(errstr); \
} while (0)
#define CUBLAS(call) do { \
cublasStatus_t _e = (call); \
if (_e == CUBLAS_STATUS_SUCCESS) break; \
char errstr[128]; \
snprintf(errstr, 128, \
"%s(%d) CUBLAS Error(%d)\n", \
__FILE__, __LINE__, _e); \
throw std::runtime_error(errstr); \
} while (0)
#define DOM_FILE (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : strrchr(__FILE__, '\') ? strrchr(__FILE__, '\') + 1 : __FILE__)
#define MSG(msg,...) do {printf("%s(%d):" msg "\n",DOM_FILE,__LINE__,##__VA_ARGS__);} while(0)
#define NUM_SM 84
#define BLOCKS_PER_SM 16
#define BLOCKSIZE 128
//__launch_bounds__(BLOCKSIZE, BLOCKS_PER_SM)
__global__ void ELLgemm_multi_gpu(float ** results, int* index, float * vals, float * dense, int * maps, int * lengths, unsigned int nnz, unsigned int y)
{
const int tidx = threadIdx.x;
const int gpu = threadIdx.y; //this tells us what gpu to operate on
const int off1 = blockIdx.x;
const int off2 = gridDim.x;
float *result = results[gpu];
int length = lengths[gpu];
int map = maps[gpu];
index += nnz * (off1+map) + tidx;
vals += nnz * (off1+map) + tidx;
result += y * off1 + tidx;
dense += tidx;
//if (blockIdx.x > 0) return;
for (int read = off1; read<length; read += off2)
{
for (int b = 0; b<y; b += 32)
{
float calc = 0.0f;
for (int a = 0; a<nnz; a += 32)
{
int index_reg = index[a];
int vals_reg = vals[a];
for (int i = 0; i<32; i++)
{
int idx_use = __shfl_sync(0xFFFFFFFF, index_reg, i);
if (idx_use >= 0)
{
float vals_use = __shfl_sync(0xFFFFFFFF, vals_reg, i);
float dense_use = dense[y * idx_use + b];
calc += dense_use * vals_use;
}
//else if (tidx==0) printf("Error zero index, index is %d, gpu : %d, read : %d, b : %d, a is %d\n",index_reg, gpu,read,b,a);
}
//__syncwarp();
}
atomicAdd(result + b, calc);
}
index += nnz * off2;
vals += nnz * off2;
result += y * off2;
}
}
//__launch_bounds__(BLOCKSIZE, 8)
__global__ void ELLgemmT_multi_gpu(float * result, int* index, float * vals, float ** __restrict__ denses, int * __restrict__ maps, int * __restrict__ lengths, unsigned int nnz, unsigned int y) //, unsigned int nnz, unsigned int y
{
const int offz = blockIdx.z;
const int gdz = gridDim.z;
const int gpu = threadIdx.y; //this tells us what gpu to operate on
const float *dense = denses[gpu]; //gets the pointer to the dense array on the right gpu
int length = lengths[gpu]; //get the length for said array
int map = maps[gpu]; //gets the offset for array
//increment ELL to the corresponding chunk
{
int offset = 32 * blockIdx.x + threadIdx.x + (map + offz)*nnz;
index += offset;
vals += offset;
}
dense += 32 * blockIdx.y + threadIdx.x + offz*y;
result += 32 * blockIdx.y + threadIdx.x;
for (int i = offz; i<length; i += gdz)
{
//Idx_shared[tid] = index[0];
//Val_shared[tid] = vals[0];
float dense_reg = dense[0];
//__syncwarp();
int idx_reg = index[0];
int val_reg = vals[0];
for (int j = 0; j<32; j++)
{
//int write = Idx_shared[shared_offset + j];
int write = __shfl_sync(0xFFFFFFFF, idx_reg, j);
//if (write >= 0) atomicAdd(result + y * write, dense_reg * Val_shared[shared_offset + j]);
if (write >= 0) atomicAdd(result + y * write, dense_reg * __shfl_sync(0xFFFFFFFF, val_reg, j));
}
//__syncwarp();
index += gdz*nnz;
vals += gdz*nnz;
dense += gdz*y;
}
}
void ELLgemmT(float *result, int *index, float *vals, float **denses, int *maps, int *lengths, unsigned int nnz, unsigned int y, int ngpu, int totalblocks)
{
int blocksx = nnz / 32;
int blocksy = y / 32;
dim3 threadsPerBlock(32, ngpu);
dim3 numblocks(blocksx, blocksy, totalblocks / (blocksx*blocksy));
//dim3 numblocks(blocksx, blocksy, 2 * BLOCKS_PER_SM * NUM_SM / (blocksx*blocksy));
//int blocksreq = NUM_SM * 64 / ngpu;
//dim3 numblocks(blocksx, blocksy, blocksreq / blocksx / blocksy);
ELLgemmT_multi_gpu << <numblocks, threadsPerBlock >> >(result, index, vals, denses, maps, lengths, nnz, y);
}
void ELLgemmT(float *result, int *index, float *vals, float **denses, int *maps, int *lengths, unsigned int nnz, unsigned int y, int ngpu)
{
int blocksx = nnz / 32;
int blocksy = y / 32;
dim3 threadsPerBlock(32, ngpu);
dim3 numblocks(blocksx, blocksy, 2 * BLOCKS_PER_SM * NUM_SM / (blocksx*blocksy));
//int blocksreq = NUM_SM * 64 / ngpu;
//dim3 numblocks(blocksx, blocksy, blocksreq / blocksx / blocksy);
ELLgemmT_multi_gpu << <numblocks, threadsPerBlock >> >(result, index, vals, denses, maps, lengths, nnz, y);
}
void ELLgemm(float **results, int *index, float *vals, float *dense, int *maps, int *lengths, unsigned int nnz, unsigned int y, int ngpu, int totalblocks)
{
dim3 threadsPerBlock(32, ngpu);
int numblocks = totalblocks;
//int blocksreq = NUM_SM * 64 / ngpu;
//int numblocks = blocksreq;
ELLgemm_multi_gpu << <numblocks, threadsPerBlock >> >(results, index, vals, dense, maps, lengths, nnz, y);
}
void ELLgemm(float **results, int *index, float *vals, float *dense, int *maps, int *lengths, unsigned int nnz, unsigned int y, int ngpu)
{
dim3 threadsPerBlock(32, ngpu);
int numblocks = BLOCKS_PER_SM * NUM_SM;
//int blocksreq = NUM_SM * 64 / ngpu;
//int numblocks = blocksreq;
ELLgemm_multi_gpu << <numblocks, threadsPerBlock >> >(results, index, vals, dense, maps, lengths, nnz, y);
}
class Timer
{
public:
Timer() : beg_(clock_::now()) {}
void start() { beg_ = clock_::now(); }
double stop() const {
return std::chrono::duration_cast<second_>
(clock_::now() - beg_).count();
}
private:
typedef std::chrono::high_resolution_clock clock_;
typedef std::chrono::duration<double, std::ratio<1> > second_;
std::chrono::time_point<clock_> beg_;
};
int main(int argc, char* argv[])
{
try
{
if(argc!=2){
throw;
}
int ngpu = atoi(argv[1]);
Timer t;
int rows_per_gpu = 375000;//6e6 / ngpu;
int cols_per_gpu = 625000;//1e7 / ngpu;
int cols_per_gpu1 = 3125000;//5e7 / ngpu;
int nnz = 96;
int batch = 32;
int *map;
int *elems;
float **out_dvc;
float **out2_dvc;
CUDA(cudaMallocHost((void**)&map, ngpu * sizeof(int)));
CUDA(cudaMallocHost((void**)&elems, ngpu * sizeof(int)));
CUDA(cudaMallocHost((void**)&out_dvc,ngpu * sizeof(float**)));
CUDA(cudaMallocHost((void**)&out2_dvc, ngpu * sizeof(float**)));
int **colIdx_dvc = (int **)malloc(ngpu*sizeof(int**));
int **colIdx1_dvc = (int **)malloc(ngpu*sizeof(int**));
float **Val_dvc = (float **)malloc(ngpu * sizeof(float**));
float **data_dvc = (float **)malloc(ngpu * sizeof(float**));
bool runp2pmode = true;
for (int i = 0; i < ngpu; i++)
{
CUDA(cudaSetDevice(i));
map[i] = rows_per_gpu * i;
elems[i] = rows_per_gpu;
for (int j = 0; j < ngpu; j++)
{
if (i != j)
{
int access;
CUDA(cudaDeviceCanAccessPeer(&access, j, i));
runp2pmode = runp2pmode & (bool)access;
if (access)
{
CUDA(cudaDeviceEnablePeerAccess(j, 0));
MSG("Enabled P2P %d ->%d", i, j, access);
}
}
}
int *colIdx = (int *)malloc(ngpu * rows_per_gpu * nnz * sizeof(int));
int *colIdx1 = (int *)malloc(ngpu * rows_per_gpu * nnz * sizeof(int));
float *Val = (float *)malloc(ngpu * rows_per_gpu * nnz * sizeof(float));
float *data = (float *)malloc(cols_per_gpu1 * batch * sizeof(float));
float *out = (float *)malloc(rows_per_gpu * batch * sizeof(float));
//int *Idx_Count = (int *)malloc(cols_per_gpu * sizeof(int));
//for (int j = 0; j < cols_per_gpu; j++) Idx_Count[j] = 0;
for (int j = 0; j < ngpu * rows_per_gpu * nnz; j++) {
Val[j] = 1; // ((float)std::rand()) / RAND_MAX;
colIdx[j] = (((long long int)j*(long long int)(cols_per_gpu - 1)) % (long long int) cols_per_gpu);
colIdx1[j] = (((long long int)j*(long long int)(cols_per_gpu1 - 1)) % (long long int) cols_per_gpu1);
}
for (int j = 0; j < cols_per_gpu1 * batch; j++)
{
data[j] = 1.0f;
}
for (int j = 0; j < rows_per_gpu * batch; j++) out[j] = 0.0f;
CUDA(cudaMalloc((void **)&colIdx_dvc[i], ngpu * rows_per_gpu * nnz * sizeof(int)));
CUDA(cudaMalloc((void **)&colIdx1_dvc[i], ngpu * rows_per_gpu * nnz * sizeof(int)));
CUDA(cudaMalloc((void **)&Val_dvc[i], ngpu * rows_per_gpu * nnz * sizeof(float)));
CUDA(cudaMalloc((void **)&data_dvc[i], cols_per_gpu1 * batch * sizeof(float)));
CUDA(cudaMalloc((void **)&out_dvc[i], rows_per_gpu * batch * sizeof(float)));
CUDA(cudaMemcpy(colIdx_dvc[i], colIdx, ngpu * rows_per_gpu * nnz * sizeof(int), cudaMemcpyDefault));
CUDA(cudaMemcpy(colIdx1_dvc[i], colIdx1, ngpu * rows_per_gpu * nnz * sizeof(int), cudaMemcpyDefault));
CUDA(cudaMemcpy(Val_dvc[i], Val, ngpu * rows_per_gpu * nnz * sizeof(float), cudaMemcpyDefault));
CUDA(cudaMemcpy(data_dvc[i], data, cols_per_gpu1 * batch * sizeof(float), cudaMemcpyDefault));
CUDA(cudaMemcpy(out_dvc[i], out, rows_per_gpu * batch * sizeof(float), cudaMemcpyDefault));
free(out);
free(data);
free(Val);
free(colIdx);
free(colIdx1);
std::cout << "Gpu " << i << " allocated" << std::endl;
}
for (int i = 0; i < ngpu; i++)
{
CUDA(cudaSetDevice(i));
CUDA(cudaDeviceSynchronize());
}
//int pfac=19;
//int blockcnt = (int)ceil(100 * pow(1.5f, (double)pfac));
MSG("Peer To Peer mode enabled? %s", runp2pmode ? "true" : "false");
CUDA(cudaSetDevice(0));
CUDA(cudaDeviceSynchronize());
for (int pfac=0; pfac<20; pfac++)
{
int blockcnt = (int)ceil(100 * pow(1.5f, (double)pfac));
t.start();
ELLgemm(out_dvc, colIdx_dvc[0], Val_dvc[0], data_dvc[0], map, elems, nnz, batch, ngpu, blockcnt);
CUDA(cudaPeekAtLastError());
CUDA(cudaDeviceSynchronize());
MSG("%d blocks, Emul time Taken per iteration %f", blockcnt, t.stop());
}
float *out_hst = (float *)malloc(rows_per_gpu*batch * sizeof(float));
std::ofstream os1("./out_dvc1.csv", std::ios::binary | std::ios::out);
for (int i = 0; i < ngpu; i++)
{
CUDA(cudaSetDevice(i));
CUDA(cudaMemcpy(out_hst, out_dvc[i], rows_per_gpu*batch * sizeof(float), cudaMemcpyDefault));
os1.write(reinterpret_cast<const char*>(out_hst), std::streamsize(rows_per_gpu*batch * sizeof(float)));
}
CUDA(cudaSetDevice(0));
CUDA(cudaDeviceSynchronize());
for (int pfac=0; pfac<20; pfac++)
{
int blockcnt = (int)ceil(100 * pow(1.5f, (double)pfac));
t.start();
ELLgemm(out_dvc, colIdx1_dvc[0], Val_dvc[0], data_dvc[0], map, elems, nnz, batch, ngpu, blockcnt);
CUDA(cudaPeekAtLastError());
CUDA(cudaDeviceSynchronize());
MSG("%d blocks, Emul time Taken per iteration %f", blockcnt, t.stop());
}
std::ofstream os2("./out_dvc2.csv", std::ios::binary | std::ios::out);
for (int i = 0; i < ngpu; i++)
{
CUDA(cudaSetDevice(i));
CUDA(cudaMemcpy(out_hst, out_dvc[i], rows_per_gpu*batch * sizeof(float), cudaMemcpyDefault));
os2.write(reinterpret_cast<const char*>(out_hst), std::streamsize(rows_per_gpu*batch * sizeof(float)));
}
free(out_hst);
}
catch (std::exception &e)
{
{
std::cerr << " exception caught: " << e.what() << '\n';
}
}
catch (...)
{
//MSG("Unknown Error");
}
//std::cin.get();
}
```