I am working on an image-processing application that makes liberal use of CPU threads, CUDA streams, and asynchronous memcpy operations to keep GPU utilization high. One part uses a thread pool (std::async in the reproducer below) to count the non-zero pixels in tiles of an image with nppiCountInRange.
This code runs fine when:
- CPU threading is disabled, or
- NPP uses the default stream (i.e. nppSetStream(cudaStreamPerThread) is not called after context creation; see the sketch just below).
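The only NPP-level difference between the two setups is whether nppSetStream points NPP at the per-thread default stream right after context creation. A minimal sketch of that toggle (configureNppStream is a hypothetical helper, only here to make the two configurations explicit; as I understand it, cudaStreamPerThread is resolved per calling thread, so each worker's NPP launches then land on that worker's own default stream):
#include <cuda_runtime.h> // cudaStreamPerThread
#include <nppcore.h>      // nppSetStream
// Hypothetical helper, for illustration only: toggles between the two configurations.
void configureNppStream(bool usePerThreadStream)
{
    if (usePerThreadStream)
        nppSetStream(cudaStreamPerThread); // failing configuration: NPP work goes to the calling thread's default stream
    // else: leave NPP on the legacy default (NULL) stream -- the working configuration
}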
Minimal reproducer:
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime.h> // for cudaStreamPerThread
#include <future>
#include <iostream>
#include <nppcore.h>
#include <nppi.h>
#include <tuple>
#include <vector>
std::ostream& operator<<(std::ostream& os, const NppiSize& sz)
{
os << "(" << sz.width << "," << sz.height << ")";
return os;
}
std::ostream& operator<<(std::ostream& os, const NppiPoint& p)
{
os << "(" << p.x << "," << p.y << ")";
return os;
}
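// Bare-bones error handling: assert on any NPP or CUDA driver API failure.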
#define CHECK_NPP(rc) \
do \
{ \
const auto _rc = rc; \
assert(_rc == NPP_NO_ERROR); \
} while (false)
#define CHECK_CU(rc) \
do \
{ \
const auto _rc = rc; \
assert(_rc == CUDA_SUCCESS); \
} while (false)
int main(int argc, char** argv)
{
if (argc < 2)
{
std::cerr << "Usage: " << argv[0] << " D" << std::endl << " where D is the CUDA device ID" << std::endl;
return 1;
}
CHECK_CU(cuInit(0));
CUdevice cuDevice;
CHECK_CU(cuDeviceGet(&cuDevice, atoi(argv[1])));
CUcontext cuContext;
CHECK_CU(cuCtxCreate(&cuContext, CU_CTX_SCHED_AUTO, cuDevice));
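// Point NPP at the per-thread default stream; as noted above, the program behaves correctly when this call is left out.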
nppSetStream(cudaStreamPerThread);
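// Allocate a pitched 8-bit single-channel image and fill it with zeros, so every tile's in-range count is expected to be 0.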
const auto inputDim = NppiSize{1920, 1088};
int inputPitch;
auto inputData = nppiMalloc_8u_C1(inputDim.width, inputDim.height, &inputPitch);
assert(inputData);
std::cout << "Input image: " << inputDim << ", pitch: " << inputPitch << std::endl;
CHECK_NPP(nppiSet_8u_C1R(0, inputData, inputPitch, inputDim));
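// Walk the image in 16x16 tiles; each tile's count is computed in its own std::async task.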
const auto tileDim = NppiSize{16, 16};
std::vector<std::tuple<NppiPoint, std::future<int32_t>>> results;
for (int y = 0; y < inputDim.height; y += tileDim.height)
{
for (int x = 0; x < inputDim.width; x += tileDim.width)
{
const auto pt = NppiPoint{x, y};
results.push_back(std::make_tuple(
pt, std::async([=]() {
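// Make the shared CUDA context current on whichever thread ends up running this task.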
CHECK_CU(cuCtxPushCurrent(cuContext));
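// Clamp the tile to the image bounds and locate its top-left pixel in the pitched allocation.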
const auto clampedDim = NppiSize{std::min(tileDim.width, inputDim.width - pt.x),
std::min(tileDim.height, inputDim.height - pt.y)};
auto tileData = inputData + pt.y * inputPitch + pt.x;
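// Per-tile scratch buffer for NPP (conservatively sized for the full image) plus a device int for the count.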
int bufferSize;
CHECK_NPP(nppiCountInRangeGetBufferHostSize_8u_C1R(inputDim, &bufferSize));
CUdeviceptr buffer_d;
CHECK_CU(cuMemAlloc(&buffer_d, bufferSize));
CUdeviceptr result_d;
CHECK_CU(cuMemAlloc(&result_d, sizeof(int)));
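// Count the pixels whose value lies in [1, 255], i.e. the non-zero pixels of this tile.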
CHECK_NPP(nppiCountInRange_8u_C1R(tileData, inputPitch, clampedDim, reinterpret_cast<int*>(result_d), 1,
255, reinterpret_cast<Npp8u*>(buffer_d)));
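// Copy the count back to the host and free the per-tile device allocations.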
int result;
CHECK_CU(cuMemcpyDtoH(&result, result_d, sizeof(int)));
CHECK_CU(cuMemFree(result_d));
CHECK_CU(cuMemFree(buffer_d));
// Balance the cuCtxPushCurrent above before the task returns.
CHECK_CU(cuCtxPopCurrent(nullptr));
return result;
})));
}
}
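// Collect and print the per-tile counts; with the zeroed input, every count should come back as 0.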
for (auto& r : results)
{
std::cout << std::get<NppiPoint>(r) << ": " << std::get<std::future<int32_t>>(r).get() << std::endl;
}
nppiFree(inputData);
return 0;
}