Problem trying to compress a D3D11 staging texture without going back to CPU RAM first

Hello there everyone,

For a project, I need to be able to quickly transfer render textures to the CPU side, but an uncompressed texture will obviously be very large in RGBA32 format, so I want to compress it using nvcomp before copying the texture to the CPU side.

However, so far I have found this incredibly difficult to get working and hard to debug. The only way I have managed to make it work is by first copying the texture to the CPU and then compressing it on the GPU, but that obviously defeats the whole point of doing it in the first place.

First, I prepare the texture to be read out by creating a staging texture and getting a pointer to the data:

// Reads the render texture back through a D3D11 staging texture.
// On success `src` points at the mapped staging data. That pointer is CPU-visible
// memory owned by the D3D11 runtime; it is NOT a CUDA device pointer.
ID3D11Texture2D* texture = (ID3D11Texture2D*)textureObject;

// Create a staging texture that we can copy the texture data to
D3D11_TEXTURE2D_DESC desc;
texture->GetDesc(&desc);
desc.BindFlags = 0;
// Flags inherited from the source texture (mip generation, sharing, ...) are not valid
// for a staging copy with no bind flags; only the cube-map flag is kept.
desc.MiscFlags &= D3D11_RESOURCE_MISC_TEXTURECUBE;
desc.Usage = D3D11_USAGE_STAGING;
// Map(D3D11_MAP_READ) below is only valid on a resource created with CPU read access.
desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
ID3D11Texture2D* stagingTexture = nullptr;
HRESULT hr = m_Device->CreateTexture2D(&desc, nullptr, &stagingTexture);
if (FAILED(hr)) {
	// Handle error
	return;
}

// Copy texture data from the original texture to the staging texture
ID3D11DeviceContext* context = nullptr;
m_Device->GetImmediateContext(&context);
context->CopyResource(stagingTexture, texture);

// Map the staging texture and read the pixel data
D3D11_MAPPED_SUBRESOURCE mappedResource;
hr = context->Map(stagingTexture, 0, D3D11_MAP_READ, 0, &mappedResource);
if (FAILED(hr)) {
	// Handle error
	stagingTexture->Release();
	context->Release();
	return;
}

// Rows of the mapped image are mappedResource.RowPitch bytes apart, which may be larger
// than the tightly packed row size, so copy row by row rather than as one flat block.
// Once `src` is no longer needed, call context->Unmap(stagingTexture, 0) and Release()
// both stagingTexture and context.
unsigned char* src = (unsigned char*)mappedResource.pData;

Normally speaking, you could just memcpy() the “src” variable at the end there, and you’ll have the texture on CPU, but it’ll be uncompressed and transferring a large amount of data will be slow.

So here is where NVCOMP comes in:

cudaSetDevice(0);

cudaStream_t stream = nullptr;
CUDA_CHECK(cudaStreamCreate(&stream), "cudaStreamCreate");

//Create a snappy compression manager:
const int chunk_size = 1 << 16;
nvcompType_t type = NVCOMP_TYPE_UCHAR;

nvcompBatchedSnappyOpts_t format_opts{ type };
nvcomp::SnappyManager nvcomp_manager{ chunk_size, format_opts, stream };
nvcomp::CompressionConfig comp_config = nvcomp_manager.configure_compression(bufferSize);

//Allocate a buffer on the GPU to hold our uncompressed object:
size_t uncompressedSize = bufferSize;
uint8_t* uncompressedBuffer;
CUDA_CHECK(cudaMalloc(&uncompressedBuffer, uncompressedSize), "alloc uncomp");

//Copy the texture to our uncompressed buffer:
CUDA_CHECK(cudaMemcpy(uncompressedBuffer, src, bufferSize, cudaMemcpyKind::cudaMemcpyDeviceToDevice), "copy to uncomp");

//Allocate a buffer on the GPU to hold our compressed object:
size_t compressedSize = comp_config.max_compressed_buffer_size;
uint8_t* compressedBuffer;
CUDA_CHECK(cudaMalloc(&compressedBuffer, compressedSize), "cudaMalloc(compressedBuffer)");

//Sync streams and compress the data on the GPU:
CUDA_CHECK(cudaStreamSynchronize(stream), "cudaStreamSync before compress");
nvcomp_manager.compress(uncompressedBuffer, compressedBuffer, comp_config);
CUDA_CHECK(cudaStreamSynchronize(stream), "cudaStreamSync after compress");

//Remove existing output buffer and replace with our compressed file:
//Allocate a buffer to hold the compressed file on the CPU side as well:
compressedSize = nvcomp_manager.get_compressed_output_size(compressedBuffer); 
std::vector<uint8_t> dataCompressed(compressedSize);

//Copy the data from the GPU to the CPU:
CUDA_CHECK(cudaMemcpy(dataCompressed.data(), compressedBuffer, compressedSize, cudaMemcpyKind::cudaMemcpyDeviceToHost), "cudaMemcpy - gpu to cpu");

If anyone here can spot any obvious mistakes, I’d love to hear about them! So far I have not been able to figure out why this doesn’t work, so I’d really appreciate any feedback.

Thank you in advance!