Grayscale lossless JPEG encoding with nvJPEG

Hi,
I am experimenting with lossless compression of grayscale buffers using nvJPEG. Here is code that works with lossy encoding at quality 100:

#include <nvjpeg.h>

#include <cuda_runtime.h>

#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>

// Aborts the program with a file:line diagnostic when a CUDA runtime call
// does not return cudaSuccess.
//
// The argument is evaluated exactly once and cached: it is normally a function
// call, and expanding it a second time inside the error message would re-run
// the failing call. The do { } while (0) wrapper makes the macro behave like a
// single statement, so it is safe inside unbraced if/else.
#define CHECK_CUDA(err)                                                        \
    do {                                                                       \
        const auto check_cuda_status_ = (err);                                 \
        if (check_cuda_status_ != cudaSuccess) {                               \
            std::cerr << __FILE__ << ":" << __LINE__ << " - CUDA Error: "      \
                      << cudaGetErrorString(check_cuda_status_) << '\n';       \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
// Aborts the program with a file:line diagnostic when an nvJPEG call does not
// return NVJPEG_STATUS_SUCCESS. The numeric status code is printed.
//
// The argument is evaluated exactly once and cached, so a failing call is not
// executed a second time while building the error message. The
// do { } while (0) wrapper makes the macro usable as a single statement.
#define CHECK_NVJPEG(err)                                                      \
    do {                                                                       \
        const auto check_nvjpeg_status_ = (err);                               \
        if (check_nvjpeg_status_ != NVJPEG_STATUS_SUCCESS) {                   \
            std::cerr << __FILE__ << ":" << __LINE__ << " - nvJPEG Error: "    \
                      << check_nvjpeg_status_ << '\n';                         \
            exit(1);                                                           \
        }                                                                      \
    } while (0)

// Reads the whole file `filename` into memory as raw bytes.
//
// Returns the file contents, or an empty vector (after printing a message to
// std::cerr) when the file cannot be opened, its size cannot be determined, or
// fewer bytes than expected could be read. An empty file also yields an empty
// vector.
std::vector<uint8_t> readBinaryFile(const std::string& filename) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Error opening file: " << filename << '\n';
        return {};
    }

    // Determine the size by seeking to the end. tellg() reports failure as -1,
    // which must not be used as a vector size.
    file.seekg(0, std::ios::end);
    const std::streamoff file_size = file.tellg();
    if (!file || file_size < 0) {
        std::cerr << "Error determining size of file: " << filename << '\n';
        return {};
    }
    file.seekg(0, std::ios::beg);

    std::vector<uint8_t> buffer(static_cast<size_t>(file_size));
    if (buffer.empty()) {
        return buffer;
    }

    // A short read would otherwise leave the tail of the buffer zero-filled and
    // be indistinguishable from real data.
    file.read(reinterpret_cast<char*>(buffer.data()),
              static_cast<std::streamsize>(file_size));
    if (file.gcount() != static_cast<std::streamsize>(file_size)) {
        std::cerr << "Error reading file: " << filename << '\n';
        return {};
    }

    return buffer;
}

// Writes `buffer` to `filename` as raw bytes, replacing any existing file.
//
// Failures are reported on std::cerr; the function returns nothing either way.
// Besides the open failure, a failed write or flush (for example a full disk)
// is now reported instead of being silently ignored.
void writeBinaryFile(const std::string& filename,
                     const std::vector<uint8_t>& buffer) {
    std::ofstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Error opening file: " << filename << '\n';
        return;
    }

    file.write(reinterpret_cast<const char*>(buffer.data()),
               static_cast<std::streamsize>(buffer.size()));

    // close() flushes buffered data; check the stream state afterwards so that
    // errors surfacing only at flush time are caught too.
    file.close();
    if (!file) {
        std::cerr << "Error writing file: " << filename << '\n';
    }
}

int devMalloc(void*  /*ctx*/, void** p, size_t size, cudaStream_t stream) {
    return (int)cudaMallocAsync(p, size, stream);
}

int devFree(void* /*ctx*/, void* p, size_t /*size*/, cudaStream_t stream) {
    return (int)cudaFreeAsync(p, stream);
}

int pinnedMalloc(void* /*ctx*/, void** p, size_t size,
                 cudaStream_t /*stream*/) {
    return (int)cudaMallocHost(p, size);
}

int pinnedFree(void* /*ctx*/, void* p, size_t /*size*/,
               cudaStream_t /*stream*/) {
    return (int)cudaFreeHost(p);
}

// Encodes a raw 8-bit grayscale image to JPEG with nvJPEG and writes the
// resulting bitstream to disk.
//
// Usage: <program> input_file width height output_file
//   input_file   raw buffer that must contain exactly width * height bytes
//   width/height image dimensions in pixels (positive integers)
//   output_file  destination of the encoded JPEG
//
// Returns EXIT_FAILURE for bad arguments or an unreadable / mis-sized input.
// CUDA and nvJPEG failures terminate the program through CHECK_CUDA and
// CHECK_NVJPEG.
int main(int argc, char* argv[]) {
    if (argc != 5) {
        std::cout << argv[0] << " input_file width height output_file\n";
        return EXIT_FAILURE;
    }

    const std::string input_file = argv[1];
    const std::string output_file = argv[4];

    // nvjpegEncodeYUV takes the dimensions as int. Reject non-positive values
    // here: converted to size_t, negative numbers wrap around and their
    // product could accidentally match the input size.
    const int width = std::atoi(argv[2]);
    const int height = std::atoi(argv[3]);
    if (width <= 0 || height <= 0) {
        std::cerr << "width and height must be positive integers (got "
                  << argv[2] << " x " << argv[3] << ")\n";
        return EXIT_FAILURE;
    }
    const size_t pixel_count =
        static_cast<size_t>(width) * static_cast<size_t>(height);

    const auto input = readBinaryFile(input_file);

    if (input.empty()) {
        return EXIT_FAILURE;
    }

    if (input.size() != pixel_count) {
        std::cerr << "input.size(): " << input.size() << " != height * width ("
                  << pixel_count << ")\n";
        return EXIT_FAILURE;
    }

    cudaStream_t stream = nullptr;

    cudaEvent_t start_ev = nullptr;
    CHECK_CUDA(cudaEventCreateWithFlags(&start_ev, cudaEventDefault));

    cudaEvent_t end_ev = nullptr;
    CHECK_CUDA(cudaEventCreateWithFlags(&end_ev, cudaEventDefault));

    unsigned char* d_input = nullptr;

    // allocate GPU buffer for input image
    CHECK_CUDA(
        cudaMallocAsync(&d_input, input.size() * sizeof(uint8_t), stream));

    // copy input to GPU
    CHECK_CUDA(cudaMemcpyAsync(d_input, input.data(),
                               input.size() * sizeof(uint8_t),
                               cudaMemcpyHostToDevice, stream));

    nvjpegDevAllocatorV2_t dev_allocator = {&devMalloc, &devFree, nullptr};
    nvjpegPinnedAllocatorV2_t pinned_allocator = {&pinnedMalloc, &pinnedFree,
                                                  nullptr};

    // NOTE(review): NVJPEG_BACKEND_LOSSLESS_JPEG appears to be a decoder
    // backend in the nvJPEG documentation; confirm whether it has any effect
    // on the encoder path used below.
    nvjpegHandle_t nv_handle = nullptr;
    CHECK_NVJPEG(nvjpegCreateExV2(NVJPEG_BACKEND_LOSSLESS_JPEG, &dev_allocator,
                                  &pinned_allocator, NVJPEG_FLAGS_DEFAULT,
                                  &nv_handle));

    nvjpegEncoderState_t nv_enc_state = nullptr;
    CHECK_NVJPEG(nvjpegEncoderStateCreate(nv_handle, &nv_enc_state, stream));

    nvjpegEncoderParams_t nv_enc_params = nullptr;
    CHECK_NVJPEG(nvjpegEncoderParamsCreate(nv_handle, &nv_enc_params, stream));

    // Single plane, tightly packed: the row pitch equals the image width.
    nvjpegImage_t imgdesc = {{d_input, nullptr},
                             {static_cast<size_t>(width), 0}};

    CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(
        nv_enc_params, NVJPEG_CSS_GRAY, stream));

    int optimal = 0;  // no huffman optimization
    // int optimal = 1;  // huffman optimization little bit smaller but way slower!

    CHECK_NVJPEG(
        nvjpegEncoderParamsSetOptimizedHuffman(nv_enc_params, optimal, stream));

    int quality = 100;
    CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(nv_enc_params, quality, stream));

    // --------------------- Set encoding -----------------------------

    // OK faster than NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN and smaller files
    CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(
        nv_enc_params, NVJPEG_ENCODING_BASELINE_DCT, stream));

    // KO
    // CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(
    //     nv_enc_params, NVJPEG_ENCODING_EXTENDED_SEQUENTIAL_DCT_HUFFMAN,
    //     stream));

    // OK
    // CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(
    //     nv_enc_params, NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, stream));

    // KO
    // NOTE(review): the two encodings marked OK are the baseline and
    // progressive DCT ones; the encoder presumably supports only those, which
    // would explain the invalid-parameter status here - confirm against the
    // nvjpegEncoderParamsSetEncoding documentation.
    // CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(
    //     nv_enc_params, NVJPEG_ENCODING_LOSSLESS_HUFFMAN, stream));

    // -----------------------------------------------------------------

    CHECK_CUDA(cudaEventRecord(start_ev, stream));

    // Compress image
    CHECK_NVJPEG(nvjpegEncodeYUV(nv_handle, nv_enc_state, nv_enc_params,
                                 &imgdesc, NVJPEG_CSS_GRAY, width, height,
                                 stream));

    CHECK_CUDA(cudaEventRecord(end_ev, stream));

    // Ask the encoder for the bitstream size first (data == nullptr) rather
    // than assuming it fits in the input buffer: at quality 100 the JPEG can be
    // larger than the raw image, especially for noisy data.
    size_t encoded_length = 0;
    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstreamDevice(
        nv_handle, nv_enc_state, nullptr, &encoded_length, stream));

    // The length is only valid on the host once the stream has drained.
    CHECK_CUDA(cudaStreamSynchronize(stream));

    // Dedicated, correctly sized device buffer for the bitstream.
    unsigned char* d_output = nullptr;
    CHECK_CUDA(cudaMallocAsync(&d_output, encoded_length, stream));

    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstreamDevice(
        nv_handle, nv_enc_state, d_output, &encoded_length, stream));

    CHECK_CUDA(cudaStreamSynchronize(stream));

    std::vector<uint8_t> output(encoded_length);

    CHECK_CUDA(cudaMemcpyAsync(output.data(), d_output,
                               encoded_length * sizeof(uint8_t),
                               cudaMemcpyDeviceToHost, stream));

    CHECK_CUDA(cudaFreeAsync(d_output, stream));
    CHECK_CUDA(cudaFreeAsync(d_input, stream));

    // Make sure the copy into `output` has finished before it is used, and
    // that both events have completed before the elapsed time is queried.
    CHECK_CUDA(cudaStreamSynchronize(stream));

    float duration_ms = 0;

    CHECK_CUDA(cudaEventElapsedTime(&duration_ms, start_ev, end_ev));

    std::cout << "JPEG encode duration: " << duration_ms << " ms / channel\n";

    // cleanup
    CHECK_NVJPEG(nvjpegEncoderParamsDestroy(nv_enc_params));
    CHECK_NVJPEG(nvjpegEncoderStateDestroy(nv_enc_state));
    CHECK_NVJPEG(nvjpegDestroy(nv_handle));

    CHECK_CUDA(cudaEventDestroy(end_ev));
    CHECK_CUDA(cudaEventDestroy(start_ev));

    // write file to disk
    writeBinaryFile(output_file, output);

    return EXIT_SUCCESS;
}

If I instead enable lossless compression with:

CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(
    nv_enc_params, NVJPEG_ENCODING_LOSSLESS_HUFFMAN, stream));

I get error 2 (NVJPEG_STATUS_INVALID_PARAMETER) on that line.

Any idea how to enable lossless compression, please?

Does anyone have a solution to this problem, please?