This program produces correct output without the OptiX denoiser, but when I run the image through the denoiser, the result is simply a copy of the alpha image with all RGB values equal to 0.
Here is what the initialization of the denoiser looks like:
/**
 * Builds the OptiX pipeline, allocates all frame buffers and sets up the
 * temporal 2x-upscaling denoiser.
 *
 * The path tracer writes its colour and guide images at half the requested
 * resolution (fbSize / 2 in each dimension); outputBuffer, prevBuffer and the
 * denoiser's internal guide layers are allocated at the full resolution.
 *
 * @param size  full-resolution framebuffer size in pixels.
 */
Renderer::Renderer(const int2 &size) {
    initOptix();
    createContext();
    getSourceCode();
    createModule();

    raygenPGs.resize(1);
    createRaygenProgram(0, "__raygen__renderFrame");
    missPGs.resize(RAY_TYPE_COUNT);
    hitgroupPGs.resize(RAY_TYPE_COUNT);
    createMissProgram(INTERSECT_RAY_TYPE, "__miss__radiance");
    createHitgroupProgram(INTERSECT_RAY_TYPE, "__closesthit__radiance", "__anyhit__radiance");
    createPipeline();

    paramsBuffer.alloc(sizeof(params));

    fbSize = size;
    renderBuffer = new sutil::CUDAOutputBuffer<float4>(
        sutil::CUDAOutputBufferType::GL_INTEROP,
        fbSize.x, fbSize.y
    );
    renderBuffer->setStream(stream);

    // Pixel counts are computed in size_t so the byte sizes below cannot
    // overflow int arithmetic for large framebuffers.
    const size_t fullPixels = static_cast<size_t>(fbSize.x) * static_cast<size_t>(fbSize.y);
    const size_t halfPixels = static_cast<size_t>(fbSize.x / 2) * static_cast<size_t>(fbSize.y / 2);

    // Full-resolution images: denoiser output and the previous frame's output.
    outputBuffer.alloc(sizeof(vec4f) * fullPixels);
    prevBuffer.alloc(sizeof(vec4f) * fullPixels);

    // Half-resolution images written by the ray-generation program.
    albedoBuffer.alloc(sizeof(vec4f) * halfPixels);
    normalBuffer.alloc(sizeof(vec4f) * halfPixels);
    colorBuffer.alloc(sizeof(vec4f) * halfPixels);
    flowBuffer.alloc(sizeof(float4) * halfPixels);
    flowTrustworthinessBuffer.alloc(sizeof(float4) * halfPixels);
    randomBuffer.alloc(sizeof(curandStateXORWOW_t) * halfPixels);

    params.initialized = false;

    // Zero-initialise so that any option field not set explicitly below has a
    // defined value instead of stack garbage.
    OptixDenoiserOptions dnOptions = {};
    dnOptions.guideAlbedo = 1;
    dnOptions.guideNormal = 1;
    dnOptions.denoiseAlpha = OPTIX_DENOISER_ALPHA_MODE_COPY;
    OPTIX_CHECK(optixDenoiserCreate(optixContext, OPTIX_DENOISER_MODEL_KIND_TEMPORAL_UPSCALE2X, &dnOptions, &denoiser));

    // NOTE(review): the dimensions given here are the maximum size of the
    // denoiser *input* layers. render() feeds half-resolution inputs, so the
    // full-resolution size over-provisions state and scratch memory; confirm
    // against the OptiX documentation whether fbSize / 2 would be sufficient.
    OPTIX_CHECK(optixDenoiserComputeMemoryResources(denoiser, fbSize.x, fbSize.y, &dnSizes));
    CUDA_CHECK(cudaMalloc((void **)&dnState, dnSizes.stateSizeInBytes));
    CUDA_CHECK(cudaMalloc((void **)&dnScratch, dnSizes.withoutOverlapScratchSizeInBytes));
    OPTIX_CHECK(optixDenoiserSetup(
        denoiser, stream,
        fbSize.x, fbSize.y,
        dnState, dnSizes.stateSizeInBytes,
        dnScratch, dnSizes.withoutOverlapScratchSizeInBytes
    ));

    // Internal guide layers live at the output (full) resolution. cudaMalloc
    // returns uninitialised memory, so both are cleared to give the denoiser a
    // well-defined "previous" layer on the first frames.
    const size_t internalGuideBytes = dnSizes.internalGuideLayerPixelSizeInBytes * fullPixels;
    CUDA_CHECK(cudaMalloc((void **)&internalIn, internalGuideBytes));
    CUDA_CHECK(cudaMalloc((void **)&internalOut, internalGuideBytes));
    CUDA_CHECK(cudaMemsetAsync(reinterpret_cast<void *>(internalIn), 0, internalGuideBytes, stream));
    CUDA_CHECK(cudaMemsetAsync(reinterpret_cast<void *>(internalOut), 0, internalGuideBytes, stream));

    // Device scalars for the HDR normalisation parameters. They are only
    // allocated here; their contents are undefined until something writes
    // them (e.g. optixDenoiserComputeIntensity / optixDenoiserComputeAverageColor)
    // and that must happen before optixDenoiserInvoke reads them.
    CUDA_CHECK(cudaMalloc((void **)&intensityPtr, sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&avgColorPtr, sizeof(float3)));

    dnParams.hdrIntensity = intensityPtr;
    dnParams.blendFactor = 0.0f;
    dnParams.hdrAverageColor = avgColorPtr;
    // No valid previous output / internal guide layer exists before frame one.
    dnParams.temporalModeUsePreviousLayers = 0;

    CUDA_SYNC_CHECK();
    std::cout << "Successfully set up renderer." << std::endl;
}
The rendering part looks like this:
void Renderer::render() {
CUDA_SYNC_CHECK();
if (!params.frameIndex) params.preCamera = params.camera;
params.frame.fbSize = make_int2(fbSize.x / 2, fbSize.y / 2);
params.frame.randomBuffer = (curandStateXORWOW_t *)randomBuffer.d_pointer();
params.frame.prevBuffer = (vec4f *)prevBuffer.d_pointer();
params.frame.albedoBuffer = (vec4f *)albedoBuffer.d_pointer();
params.frame.normalBuffer = (vec4f *)normalBuffer.d_pointer();
params.frame.colorBuffer = (vec4f *)colorBuffer.d_pointer();
params.frame.flowBuffer = (float4 *)flowBuffer.d_pointer();
params.frame.flowTrustworthinessBuffer = (float4 *)flowTrustworthinessBuffer.d_pointer();
paramsBuffer.upload(¶ms, 1);
++ params.frameIndex;
CUDA_SYNC_CHECK();
OPTIX_CHECK(optixLaunch(
pipeline, stream,
paramsBuffer.d_pointer(),
paramsBuffer.sizeInBytes,
&sbt,
fbSize.x / 2, fbSize.y / 2,
1
));
CUDA_SYNC_CHECK();
OptixDenoiserGuideLayer guideLayer;
guideLayer.albedo = createOptixImageF4(albedoBuffer.d_pointer(), fbSize.x / 2, fbSize.y / 2);
guideLayer.normal = createOptixImageF4(normalBuffer.d_pointer(), fbSize.x / 2, fbSize.y / 2);
guideLayer.flow = createOptixImageF4(flowBuffer.d_pointer(), fbSize.x / 2, fbSize.y / 2);
guideLayer.flowTrustworthiness = createOptixImageF4(flowTrustworthinessBuffer.d_pointer(), fbSize.x / 2, fbSize.y / 2);
guideLayer.previousOutputInternalGuideLayer = OptixImage2D{
internalIn,
(unsigned int)fbSize.x, (unsigned int)fbSize.y,
(unsigned int)dnSizes.internalGuideLayerPixelSizeInBytes * fbSize.x,
(unsigned int)dnSizes.internalGuideLayerPixelSizeInBytes,
OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER
};
guideLayer.outputInternalGuideLayer = OptixImage2D{
internalOut,
(unsigned int)fbSize.x, (unsigned int)fbSize.y,
(unsigned int)dnSizes.internalGuideLayerPixelSizeInBytes * fbSize.x,
(unsigned int)dnSizes.internalGuideLayerPixelSizeInBytes,
OPTIX_PIXEL_FORMAT_INTERNAL_GUIDE_LAYER
};
OptixDenoiserLayer layer;
layer.output = createOptixImageF4(outputBuffer.d_pointer(), fbSize.x, fbSize.y);
layer.previousOutput = createOptixImageF4(prevBuffer.d_pointer(), fbSize.x, fbSize.y);
layer.input = createOptixImageF4(colorBuffer.d_pointer(), fbSize.x / 2, fbSize.y / 2);
layer.type = OPTIX_DENOISER_AOV_TYPE_NONE;
OPTIX_CHECK(optixDenoiserInvoke(
denoiser, stream,
&dnParams,
dnState, dnSizes.stateSizeInBytes,
&guideLayer, &layer, 1,
0, 0,
dnScratch, dnSizes.withoutOverlapScratchSizeInBytes
));
CUDA_SYNC_CHECK();
CUDA_CHECK(cudaMemcpyAsync((void*)prevBuffer.d_pointer(), (vec4f *)outputBuffer.d_pointer(), outputBuffer.sizeInBytes, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(renderBuffer->map(), (vec4f *)outputBuffer.d_pointer(), outputBuffer.sizeInBytes, cudaMemcpyDeviceToDevice, stream));
renderBuffer->unmap();
CUDA_SYNC_CHECK();
params.initialized = true;
params.preCamera = params.camera;
dnParams.temporalModeUsePreviousLayers = 1;
}
The program wraps its device pointers in the following helper struct:
/**
 * Thin wrapper around a single cudaMalloc'ed device allocation.
 *
 * The struct tracks the allocation's size and whether alloc() has been called.
 * It has no destructor: memory is released only by an explicit free() (or
 * implicitly by resize()).
 */
struct CUDABuffer {
    //! device address of the allocation as a CUdeviceptr
    inline CUdeviceptr d_pointer() const {
        return (CUdeviceptr)d_ptr;
    }

    //! re-size buffer to given number of bytes (previous contents are discarded)
    void resize(size_t size) {
        // Test the allocation flag rather than the pointer: a buffer that was
        // alloc()'ed may hold a null d_ptr (e.g. a zero-byte request), and
        // alloc() asserts that nothing is currently allocated.
        if (allocated) free();
        alloc(size);
    }

    //! allocate to given number of bytes; the buffer must not be allocated yet
    void alloc(size_t size) {
        assert(!allocated);
        this->sizeInBytes = size;
        CUDA_CHECK(cudaMalloc((void **)&d_ptr, sizeInBytes));
        allocated = true;
    }

    //! free allocated memory; the buffer must currently be allocated
    void free() {
        assert(allocated);
        if (d_ptr != nullptr) {
            CUDA_CHECK(cudaFree(d_ptr));
            d_ptr = nullptr;
        }
        sizeInBytes = 0;
        allocated = false;
    }

    //! allocate exactly enough bytes for vt and copy its elements to the device
    template<typename T>
    void alloc_and_upload(const std::vector<T> &vt) {
        alloc(vt.size() * sizeof(T));
        upload((const T *)vt.data(), vt.size());
    }

    //! copy count elements from host to device; count must match the buffer size exactly
    template<typename T>
    void upload(const T *t, size_t count) {
        assert(allocated);
        assert(sizeInBytes == count * sizeof(T));
        if (d_ptr != nullptr) {
            // cudaMemcpy takes a const source, so constness is preserved here.
            CUDA_CHECK(cudaMemcpy(d_ptr, static_cast<const void *>(t), count * sizeof(T), cudaMemcpyHostToDevice));
        }
    }

    //! copy count elements from device to host; count must match the buffer size exactly
    template<typename T>
    void download(T *t, size_t count) {
        assert(allocated);
        assert(sizeInBytes == count * sizeof(T));
        if (d_ptr != nullptr) {
            CUDA_CHECK(cudaMemcpy((void *)t, d_ptr, count * sizeof(T), cudaMemcpyDeviceToHost));
        }
    }

    size_t sizeInBytes{ 0 };    //!< size of the current allocation in bytes
    void *d_ptr{ nullptr };     //!< device pointer written by cudaMalloc
    bool allocated{ false };    //!< true between alloc() and free()
};