Hello,
I benchmarked different architectures for deploying some CUDA-enabled software, and the results raised questions I'm unable to answer. I hope someone here might have a clue.
The architectures studied here for deployment are:
- Multi-threaded application using one CUDA stream per thread, MPS not running (“threads” in the plot legend)
- Multiple instances of a single-threaded application running over MPS (“processes + MPS” in the plot legend)
- Multi-threaded application using one CUDA stream per thread, running over MPS (“threads + MPS” in the plot legend)
- Multiple instances of a multi-threaded application running over MPS (not drawn in the plot, to keep the sample code simple enough; this gives the best throughput for our actual workload, using 2 threads per process and 16 processes over MPS)
The work (cf. the doWork function in the code below) consists of copying memory from host to device, running a series of CUDA kernels, then copying some memory back from the device. I've tested different variations, such as using pageable memory or page-locked/pinned memory on the host side. When the host-side memory is pageable, I've tested with and without an explicit stream synchronization before the cudaMemcpyAsync(DeviceToHost).
Here’s a plot of the different experiments (all done on Ubuntu 16.04, CUDA 9.2, driver 396.37, Tesla P40, 2 x Xeon E5-2680 v4 with 56 hardware threads total; all processes started with affinity on the CPU socket closest to the GPU):
https://user-images.githubusercontent.com/9768336/44277788-121e7400-a24c-11e8-995c-f0da744f0db3.png
Here is the code used to produce the plot, compiled with nvcc --machine 64 -O3 --use_fast_math -gencode=arch=compute_61,code=\"sm_61,compute_61\" -o testConcurrency testConcurrency.cu
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <inttypes.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#define NBADD 220
#define NBINNERITER 1000
#define NBOUTERITER 8
#define LEN (352 * 1024)
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__ )
static void check(cudaError_t result, char const *const func, const char *const file, int const line)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
cudaDeviceReset();
// Make sure we call CUDA Device Reset before exiting
exit(EXIT_FAILURE);
}
}
static uint64_t getTicksMicroSeconds()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return ((uint64_t)ts.tv_sec) * UINT64_C(1000000) + (uint64_t)(ts.tv_nsec / 1000);
}
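/* Device-side busy work: __sad(v, v, v) computes |v - v| + v = v, so the value is preserved while the thread burns cycles */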
__device__ int4 nop4(int4 v)
{
v.x = __sad(v.x, v.x, v.x);
v.y = __sad(v.y, v.y, v.y);
v.z = __sad(v.z, v.z, v.z);
v.w = __sad(v.w, v.w, v.w);
return v;
}
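/* Grid-stride add of 'value' to every element of pSrc into pDst; each thread also runs the nop4 busy-loop once (when i == idx) to pad the kernel duration */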
__global__ void kernelAddConstant(const int* pSrc, int* pDst, const int value, const int length)
{
const int4* pSrcI4 = (const int4*)pSrc;
int4* pDstI4 = (int4*)pDst;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int gridSize = blockDim.x * gridDim.x;
for (int i = idx; i < length/4; i += gridSize) {
int4 v = pSrcI4[i];
v.x += value;
v.y += value;
v.z += value;
v.w += value;
if (idx == i) {
for (int j = 0; j < 3072; ++j) {
v = nop4(v);
}
}
pDstI4[i] = v;
}
}
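/* One work item: async H2D copy, NBADD chained kernel launches (swapping input/output between launches), then async D2H copy of the result; the synchronization points depend on the memory/sync mode under test */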
static int doWork(const int* cpuExpected, const int* cpuInput, int* cpuOutput, const size_t len, int* gpuInput, int* gpuOutput, cudaStream_t stream, int usePinnedMemory, int noSync)
{
checkCudaErrors(cudaMemcpyAsync(gpuInput, cpuInput, len * sizeof(int), cudaMemcpyHostToDevice, stream));
for (int i = 0; i < NBADD; ++i) {
kernelAddConstant<<<dim3(5), dim3(64), 0, stream>>>(gpuInput, gpuOutput, i, (int)(len / 64));
int* tmp = gpuInput;
gpuInput = gpuOutput;
gpuOutput = tmp;
}
if (!usePinnedMemory && !noSync) {
checkCudaErrors(cudaStreamSynchronize(stream));
}
checkCudaErrors(cudaMemcpyAsync(cpuOutput, gpuOutput, len * sizeof(int), cudaMemcpyDeviceToHost, stream));
if (usePinnedMemory) {
checkCudaErrors(cudaStreamSynchronize(stream));
}
return memcmp(cpuOutput, cpuExpected, (len / 64) * sizeof(int));
}
typedef struct {
int const* cpuExpected;
size_t len;
uint64_t volatile* elapsed;
int usePinnedMemory;
int noSync;
} runThreadArgs;
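/* Worker body, run either as a pthread or inside a forked process: allocates host and device buffers and a non-blocking stream, then times NBINNERITER calls to doWork */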
static void* runThread(void* argsVoid)
{
runThreadArgs* args = (runThreadArgs*)argsVoid;
const int* cpuExpected = args->cpuExpected;
const size_t len = args->len;
const int usePinnedMemory = args->usePinnedMemory;
const int noSync = args->noSync;
int* cpuInput;
int* cpuOutput;
int* gpuInput;
int* gpuOutput;
cudaStream_t stream;
if (!usePinnedMemory) {
cpuInput = (int*)malloc(len * sizeof(int));
cpuOutput = (int*)malloc(len * sizeof(int));
}
else {
checkCudaErrors(cudaMallocHost(&cpuInput, len * sizeof(int)));
checkCudaErrors(cudaMallocHost(&cpuOutput, len * sizeof(int)));
}
if ((cpuInput == NULL) || (cpuOutput == NULL)) {
fprintf(stderr, "Allocation failure\n");
exit(EXIT_FAILURE);
}
memset(cpuInput, 0, len * sizeof(int));
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMalloc(&gpuInput, len * sizeof(int)));
checkCudaErrors(cudaMalloc(&gpuOutput, len * sizeof(int)));
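/* Handshake with the parent: publish 1 to signal ready, then spin until the parent resets it to 0 so all workers start together */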
*args->elapsed = 1U;
while (*args->elapsed == 1U);
uint64_t start = getTicksMicroSeconds();
for (int i = 0; i < NBINNERITER; ++i) {
if (doWork(cpuExpected, cpuInput, cpuOutput, len, gpuInput, gpuOutput, stream, usePinnedMemory, noSync)) {
fprintf(stderr, "Invalid result, iter %d\n", i);
exit(EXIT_FAILURE);
}
}
uint64_t stop = getTicksMicroSeconds();
*args->elapsed = stop - start;
checkCudaErrors(cudaFree(gpuInput));
checkCudaErrors(cudaFree(gpuOutput));
checkCudaErrors(cudaStreamDestroy(stream));
if (!usePinnedMemory) {
free(cpuInput);
free(cpuOutput);
}
else {
checkCudaErrors(cudaFreeHost(cpuInput));
checkCudaErrors(cudaFreeHost(cpuOutput));
}
return NULL;
}
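/* Command line: useFork usePinnedMemory noSync nbWorkers [nbWorkers ...] */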
int main(int argc, char* argv[])
{
const size_t len = LEN;
int* cpuExpected;
pthread_t thread[32];
runThreadArgs args[32];
pid_t pid[32];
uint64_t volatile* pElapsed;
pthread_attr_t attr;
if (argc < 5) {
fprintf(stderr, "Usage: %s useFork usePinnedMemory noSync nbWorkers [nbWorkers ...]\n", argv[0]);
exit(EXIT_FAILURE);
}
int useFork = atoi(argv[1]);
int usePinnedMemory = atoi(argv[2]);
int noSync = atoi(argv[3]);
argc -= 4;
argv += 4;
cpuExpected = (int*)malloc(len * sizeof(int));
if (cpuExpected == NULL) {
fprintf(stderr, "Allocation failure\n");
exit(EXIT_FAILURE);
}
for (size_t i = 0; i < len; ++i) {
cpuExpected[i] = ((NBADD-1) * (NBADD-2)) / 2;
}
pElapsed = (uint64_t volatile*)mmap( 0, (sizeof(args) / sizeof(args[0])) * sizeof(uint64_t), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1 /*fd*/, 0);
if (pElapsed == MAP_FAILED) {
fprintf(stderr, "mmap failed\n");
exit(EXIT_FAILURE);
}
if (useFork) {
fprintf(stdout, "Using sub-processes\n");
}
else {
fprintf(stdout, "Using threads\n");
cudaFree(NULL); /* init runtime API */
pthread_attr_init(&attr);
pthread_attr_setstacksize( &attr, 1024 * 1024);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
}
for (size_t t = 0; t < (sizeof(args) / sizeof(args[0])); ++t) {
args[t].cpuExpected = cpuExpected;
args[t].len = len;
args[t].elapsed = pElapsed + t;
args[t].usePinnedMemory = usePinnedMemory;
args[t].noSync = noSync;
}
while (argc > 0)
{
double maxThroughput = 0.0F;
size_t nbThreads = atoi(argv[0]);
if (nbThreads > (sizeof(args) / sizeof(args[0]))) {
fprintf(stderr, "Too many workers requested\n");
exit(EXIT_FAILURE);
}
argc--;
argv++;
for (int i = 0; i < NBOUTERITER; ++i)
{
memset((void*)pElapsed, 0, (sizeof(args) / sizeof(args[0])) * sizeof(uint64_t));
if (useFork) {
memset(pid, 0, sizeof(pid));
for (size_t t = 0; t < nbThreads; ++t) {
pid[t] = fork();
if (pid[t] == 0) {
/* I'm a child process */
cudaFree(NULL); /* init runtime API */
runThread(args + t);
exit(EXIT_SUCCESS);
}
}
/* Parent process */
/* wait until all child processes are ready */
for (size_t t = 0; t < nbThreads; ++t) {
while (*args[t].elapsed != 1U);
}
/* release processes */
for (size_t t = 0; t < nbThreads; ++t) {
*args[t].elapsed = 0U;
}
int status = EXIT_SUCCESS;
for (size_t t = 0; t < nbThreads; ++t) {
if (pid[t] < 0) {
/* fork failed */
status = EXIT_FAILURE;
}
else {
int wstatus;
waitpid(pid[t], &wstatus, 0);
if (WIFEXITED(wstatus)) {
if (WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
status = EXIT_FAILURE;
}
}
else {
status = EXIT_FAILURE;
}
}
}
if (status != EXIT_SUCCESS) {
exit(EXIT_FAILURE);
}
}
else {
for (size_t t = 0; t < nbThreads; ++t) {
pthread_create(thread + t, &attr, &runThread, (void*)(args + t));
}
/* wait until all threads are ready */
for (size_t t = 0; t < nbThreads; ++t) {
while (*args[t].elapsed != 1U);
}
/* release threads */
for (size_t t = 0; t < nbThreads; ++t) {
*args[t].elapsed = 0U;
}
for (size_t t = 0; t < nbThreads; ++t) {
pthread_join(thread[t], NULL);
}
}
uint64_t elapsed = 0;
for (size_t t = 0; t < nbThreads; ++t) {
elapsed += pElapsed[t];
}
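/* elapsed is the sum of the per-worker times, so the per-worker average is elapsed / nbThreads; aggregate throughput is nbThreads * NBINNERITER / average time, hence the squared factor */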
double throughput = (1000000.0 / (double)elapsed) * ((double)NBINNERITER * (double)(nbThreads * nbThreads));
if (throughput > maxThroughput) {
maxThroughput = throughput;
}
}
fprintf(stdout, "%d threads - %f ips\n", (int)nbThreads, maxThroughput);
}
munmap((void*)pElapsed, (sizeof(args) / sizeof(args[0])) * sizeof(uint64_t));
return EXIT_SUCCESS;
}
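For reference, the program takes the three mode flags followed by the list of worker counts to benchmark (this follows from the argument parsing in main; the counts below are only an example):
./testConcurrency 0 1 0 1 2 4 8 16 32 (threads, pinned host memory)
./testConcurrency 1 1 0 1 2 4 8 16 32 (one process per worker; start the MPS control daemon first for the “+ MPS” curves)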
Obviously, the kernel does nothing interesting here; it has only been tuned to roughly mimic the actual workload.
One thing to note is the number of kernel launches per iteration. I think it's quite high, but I have little to no leverage to reduce it, and in any case it does not change the fact that multi-process + MPS gives higher throughput than multi-thread.
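To illustrate: in this synthetic benchmark the whole chain of launches could of course be collapsed into a single kernel. The untested sketch below adds the constant that the original launch chain leaves in the buffer that is copied back, but it omits the nop4 padding and obviously does not apply to the real workload, where the kernels differ:

__global__ void kernelAddConstantFused(const int* pSrc, int* pDst, const int nbAdd, const int length)
{
    const int4* pSrcI4 = (const int4*)pSrc;
    int4* pDstI4 = (int4*)pDst;
    /* Value produced by iterations 0..nbAdd-2 of the original launch chain
       (the last pointer swap in doWork leaves that iteration's result in gpuOutput) */
    const int total = ((nbAdd - 1) * (nbAdd - 2)) / 2;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int gridSize = blockDim.x * gridDim.x;
    for (int i = idx; i < length / 4; i += gridSize) {
        int4 v = pSrcI4[i];
        v.x += total;
        v.y += total;
        v.z += total;
        v.w += total;
        pDstI4[i] = v;
    }
}

A single launch like this would remove most of the launch-rate pressure, which is exactly what I cannot do in the real application.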
I can attach some profiling data for the multi-thread case (mostly, it shows the average time per call going up for cudaLaunchKernel and cudaMemcpyAsync, while the average cudaStreamSynchronize time goes down).
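If it's useful, this is roughly how I would instrument the launch path to get those per-call averages without a profiler (hypothetical sketch reusing getTicksMicroSeconds and kernelAddConstant from the code above; the counters would have to be per-thread in the multi-threaded case):

/* Accumulated time spent issuing the launch vs. waiting on the stream */
static uint64_t launchTicks = 0;
static uint64_t syncTicks = 0;

static void timedLaunchAndSync(const int* gpuInput, int* gpuOutput, size_t len, int value, cudaStream_t stream)
{
    uint64_t t0 = getTicksMicroSeconds();
    kernelAddConstant<<<dim3(5), dim3(64), 0, stream>>>(gpuInput, gpuOutput, value, (int)(len / 64));
    launchTicks += getTicksMicroSeconds() - t0;   /* host-side cost of the kernel launch */

    t0 = getTicksMicroSeconds();
    checkCudaErrors(cudaStreamSynchronize(stream));
    syncTicks += getTicksMicroSeconds() - t0;     /* time the host spends blocked on the GPU */
    /* Divide each counter by the number of calls to get the per-call average */
}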
Thanks