Hi All,
I’ve created a minimalistic sample program that captures the strange behavior of CUDA runtime API.
When compiled with "--default-stream=per-thread", each host thread should be able to launch its own kernel and wait for the result using cudaStreamSynchronize(0), all in parallel.
But it turns out to be not true …
In the example code, when you set PARALLEL=false, you can observe perfect concurrency in GPU using visual profiler, as expected
When you set PARALLEL=true, you can observe complete serialization in the GPU. If you replace cudaStreamSynchronize(0) with cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost) (you also need to uncomment the cudaMalloc line and comment out one cudaHostGetDevicePointer), concurrency improves somewhat, but is still far from the perfect concurrency of the first case.
The kernel is producing nothing interesting, so its logic can be ignored :-)
I’m using a single 1080 Ti, CUDA 10.1, Windows 7, Visual Studio Community 2017.
Any thoughts? Thanks for your time!
#include <atomic>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
#include <cuda_profiler_api.h>
// true  -> one host thread per Portal; kernels go to per-thread default streams.
// false -> a single host thread; kernels go to explicit non-blocking streams.
const bool PARALLEL = true;
const int ArrayLength = 128;      // floats per input buffer (one element per kernel thread)
const int LoopLength = 10240000;  // per-element iterations: keeps each kernel busy long enough to see overlap in the profiler
const int ThreadsCount = 28;      // number of worker threads / Portals launched concurrently
std::atomic<bool> ready(false);   // start flag: workers spin on this until main releases them all at once
// Busy-work kernel: each thread walks a strided subset of the input
// (stride = blockDim.x), iterates a transcendental recurrence LoopLength
// times per element, and folds every result into the single output cell
// with atomicAdd. The numeric result is irrelevant; the kernel exists
// only to occupy the GPU for a measurable amount of time.
// Launch shape expected by callers: <<<1, 128>>> with ArrayLength == 128.
__global__ void kernel_inference(const float* __restrict__ input, float* __restrict__ output)
{
    for (int idx = threadIdx.x; idx < ArrayLength; idx += blockDim.x)
    {
        float v = input[idx];
        for (int iter = 0; iter < LoopLength; ++iter)
        {
            v = sinf(v * v + v + 1.0f);
        }
        atomicAdd(output, v);
    }
}
// One per-thread "portal": pinned, mapped host buffers plus the device
// aliases obtained from cudaHostGetDevicePointer, and (only in the
// non-parallel configuration) a dedicated non-blocking stream.
//
// PARALLEL == true  : each host thread owns a Portal and launches on its
//                     per-thread default stream (stream 0 under
//                     --default-stream=per-thread), then blocks in query().
// PARALLEL == false : the main thread owns every Portal, launches each on
//                     its explicit stream, and syncs/reads results itself.
struct Portal
{
    cudaStream_t stream;   // valid only when !PARALLEL
    float* inputh;         // pinned+mapped host input, ArrayLength floats
    float* inputd;         // device alias of inputh
    float* outputh;        // pinned+mapped host output, one float
    float* outputd;        // device alias of outputh

    Portal()
    {
        // Mapped pinned allocations: the kernel reads/writes host memory
        // directly through the device aliases, so no explicit memcpy is needed.
        check(cudaHostAlloc(&inputh, ArrayLength * sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(input)");
        check(cudaHostAlloc(&outputh, sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(output)");
        //check(cudaMalloc(&outputd, sizeof(float)), "cudaMalloc(output)");
        check(cudaHostGetDevicePointer(&inputd, inputh, 0), "cudaHostGetDevicePointer(input)");
        check(cudaHostGetDevicePointer(&outputd, outputh, 0), "cudaHostGetDevicePointer(output)");
        if (!PARALLEL)
            check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), "cudaStreamCreateWithFlags");
    }

    ~Portal()
    {
        check(cudaFreeHost(inputh), "cudaFreeHost(input)");
        check(cudaFreeHost(outputh), "cudaFreeHost(output)");
        //check(cudaFree(outputd), "cudaFree(output)");
        if (!PARALLEL)
            check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }

    // Fill the input with v, v+1, ..., zero the output, and launch one kernel.
    // PARALLEL:  blocks until the kernel finishes and returns *outputh.
    // !PARALLEL: returns 0 immediately; the caller must synchronize 'stream'
    //            and read *outputh itself.
    float query(float v)
    {
        for (int i = 0; i < ArrayLength; i++)
            inputh[i] = v + i;
        *outputh = 0;
        // Stream 0 here is the per-thread default stream when compiled with
        // --default-stream=per-thread.
        kernel_inference<<<1, 128, 0, PARALLEL ? 0 : stream>>>(inputd, outputd);
        check(cudaGetLastError(), "kernel_inference launch");  // catch launch-config errors
        //if (PARALLEL) check(cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(output)");
        if (PARALLEL)
            check(cudaStreamSynchronize(0), "cudaStreamSynchronize");
        return PARALLEL ? *outputh : 0;
    }

private:
    // The original repro ignored every CUDA return code, which can hide a
    // failed allocation or launch; report failures without changing flow.
    static void check(cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
            std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
    }
};
// Worker entry point: construct this thread's Portal up front, spin until
// the main thread flips 'ready' (so all workers launch at the same moment),
// then issue a single query and print its result.
void thread_main(int id)
{
    Portal p;
    while (!ready)
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    std::cout << p.query(static_cast<float>(id)) << std::endl;
}
// Driver.
// PARALLEL == true : spawn ThreadsCount workers, each owning a Portal; the
//   'ready' flag releases them simultaneously so their kernels are launched
//   (and synchronized) concurrently on per-thread default streams.
// PARALLEL == false: the main thread launches one kernel per Portal on an
//   explicit non-blocking stream, then syncs each stream and prints results.
int main(int argc, char** argv)
{
    std::vector<std::unique_ptr<std::thread>> threads;
    std::vector<std::unique_ptr<Portal>> portals;
    if (PARALLEL)
    {
        for (int t = 0; t < ThreadsCount; t++)
            threads.push_back(std::make_unique<std::thread>(thread_main, t));
    }
    else
    {
        for (int t = 0; t < ThreadsCount; t++)
            portals.push_back(std::make_unique<Portal>());
    }
    // Ensure all setup (pinned allocations, stream creation) has finished
    // before releasing the workers / issuing the profiled launches.
    cudaDeviceSynchronize();
    ready = true;
    if (PARALLEL)
    {
        for (auto& th : threads)  // range-for avoids the signed/unsigned
            th->join();           // comparison of int against size()
    }
    else
    {
        int t = 0;
        for (auto& p : portals)
            p->query(static_cast<float>(t++));
        // All kernels are now in flight on their own streams; sync and
        // print each result in order.
        for (auto& p : portals)
        {
            cudaStreamSynchronize(p->stream);
            std::cout << *(p->outputh) << std::endl;
        }
    }
    return 0;
}
Compilation command:
CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
nvcc --gpu-architecture=sm_61 --default-stream=per-thread --optimize=3 -Xcompiler "/wd4819" --x=cu test.cpp --use_fast_math --library=cuda,cudart_static --library-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64" --output-file test.exe