Gstdsexample turn tracked obj into base64 and Add to the DsMeta

Hi there,

I tried to export the deepstream detected obj to the message Queue which managed by kafka(using gst-nvmsgbroker plugin),

the problem I’ve mat is that the nppiResizeSqrPixel_8u_C3R is really slow ,my fps drop to 1-2(over 100fps without it),so I tried to use dsexample->inter_buf to do the conversion. convert to cv::Mat is fine,but cv::imencode to jpg format with the mat caused a coredump. Is this because that the data stored in the GPU mem ?

AND if there is a way to encode the data and turn it to base64 directly with the GPU?

SL

why you need to do resize nppiResizeSqrPixel_8u_C3R?
You can refer to cuda sample to do jpeg encoding by cuda /usr/local/cuda/samples/7_CUDALibraries/jpegNPP
Deepstream 4.0 will be released. It has h265 h264 gst-v4l2 encoding, but no jpeg gst-plugin encoding.

thanks chris,

I am following the code in jpegNPP sample to encode the buffer data to jpeg. but I’ve no idea how to fit the NV21 data into the apDstImage, so I pick the data from nppiNV12ToRGB_8u_P2C3R get RGB frame anf convert to YUV to encode it to jpeg. but I ended up with a disorder output. any thing wrong?

refer to some others’work,my code is here :

int jpegNPP(Npp8u* d_srcRGB, int img_width, int img_height)
{
    NppiDCTState *pDCTState;
    CHECK_NPP_STATUS(nppiDCTInitAlloc(&pDCTState),"");

    // Parsing and Huffman Decoding (on host)
    FrameHeader oFrameHeader;
    QuantizationTable aQuantizationTables[4];
    Npp8u *pdQuantizationTables;
    cudaMalloc(&pdQuantizationTables, 64 * 4);

    HuffmanTable aHuffmanTables[4];
    HuffmanTable *pHuffmanDCTables = aHuffmanTables;
    HuffmanTable *pHuffmanACTables = &aHuffmanTables[2];
    ScanHeader oScanHeader;
    memset(&oFrameHeader,0,sizeof(FrameHeader));
    memset(aQuantizationTables,0, 4 * sizeof(QuantizationTable));
    memset(aHuffmanTables,0, 4 * sizeof(HuffmanTable));
    int nMCUBlocksH = 0;
    int nMCUBlocksV = 0;

    int nRestartInterval = -1;

    NppiSize aSrcSize[3];
    Npp16s *apdDCT[3] = {0,0,0};
    Npp32s aDCTStep[3];

    Npp8u *apSrcImage[3] = {0,0,0};
    Npp32s aSrcImageStep[3];
	size_t aSrcPitch[3];

    /***************************
    *
    *   Output   
    *
    ***************************/

	unsigned char STD_DC_Y_NRCODES[16] = { 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
	unsigned char STD_DC_Y_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

	unsigned char STD_DC_UV_NRCODES[16] = { 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
	unsigned char STD_DC_UV_VALUES[12] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

	unsigned char STD_AC_Y_NRCODES[16] = { 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0X7D };
	unsigned char STD_AC_Y_VALUES[162] =
	{
		0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
		0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
		0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
		0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
		0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
		0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
		0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
		0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
		0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
		0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
		0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
		0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
		0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
		0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
		0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
		0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
		0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
		0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
		0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
		0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
		0xf9, 0xfa
	};

	unsigned char STD_AC_UV_NRCODES[16] = { 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0X77 };
	unsigned char STD_AC_UV_VALUES[162] =
	{
		0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
		0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
		0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
		0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
		0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
		0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
		0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
		0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
		0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
		0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
		0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
		0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
		0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
		0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
		0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
		0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
		0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
		0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
		0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
		0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
		0xf9, 0xfa
	};

	aHuffmanTables[0].nClassAndIdentifier = 0;
	memcpy(aHuffmanTables[0].aCodes, STD_DC_Y_NRCODES, 16);
	memcpy(aHuffmanTables[0].aTable, STD_DC_Y_VALUES, 12);

	aHuffmanTables[1].nClassAndIdentifier = 1;
	memcpy(aHuffmanTables[1].aCodes, STD_DC_UV_NRCODES, 16);
	memcpy(aHuffmanTables[1].aTable, STD_DC_UV_VALUES, 12);

	aHuffmanTables[2].nClassAndIdentifier = 16;
	memcpy(aHuffmanTables[2].aCodes, STD_AC_Y_NRCODES, 16);
	memcpy(aHuffmanTables[2].aTable, STD_AC_Y_VALUES, 162);

	aHuffmanTables[3].nClassAndIdentifier = 17;
	memcpy(aHuffmanTables[3].aCodes, STD_AC_UV_NRCODES, 16);
	memcpy(aHuffmanTables[3].aTable, STD_AC_UV_VALUES, 162);

	unsigned char std_Y_QT[64] =
	{
		16, 11, 10, 16, 24, 40, 51, 61,
		12, 12, 14, 19, 26, 58, 60, 55,
		14, 13, 16, 24, 40, 57, 69, 56,
		14, 17, 22, 29, 51, 87, 80, 62,
		18, 22, 37, 56, 68, 109, 103, 77,
		24, 35, 55, 64, 81, 104, 113, 92,
		49, 64, 78, 87, 103, 121, 120, 101,
		72, 92, 95, 98, 112, 100, 103, 99
	};

	unsigned char std_UV_QT[64] =
	{
		17, 18, 24, 47, 99, 99, 99, 99,
		18, 21, 26, 66, 99, 99, 99, 99,
		24, 26, 56, 99, 99, 99, 99, 99,
		47, 66, 99, 99, 99, 99, 99, 99,
		99, 99, 99, 99, 99, 99, 99, 99,
		99, 99, 99, 99, 99, 99, 99, 99,
		99, 99, 99, 99, 99, 99, 99, 99,
		99, 99, 99, 99, 99, 99, 99, 99
	};



	aQuantizationTables[0].nPrecisionAndIdentifier = 0;
	memcpy(aQuantizationTables[0].aTable, std_Y_QT, 64);
	aQuantizationTables[1].nPrecisionAndIdentifier = 1;
	memcpy(aQuantizationTables[1].aTable, std_UV_QT, 64);

	CHECK_NPP_STATUS(cudaMemcpyAsync(pdQuantizationTables , aQuantizationTables[0].aTable, 64, cudaMemcpyHostToDevice),"");
	CHECK_NPP_STATUS(cudaMemcpyAsync(pdQuantizationTables + 64, aQuantizationTables[1].aTable, 64, cudaMemcpyHostToDevice),"");

	//填充帧头
	oFrameHeader.nSamplePrecision = 8;
	oFrameHeader.nComponents = 3;
	oFrameHeader.aComponentIdentifier[0] = 1;
	oFrameHeader.aComponentIdentifier[1] = 2;
	oFrameHeader.aComponentIdentifier[2] = 3;
	oFrameHeader.aSamplingFactors[0] = 34;
	oFrameHeader.aSamplingFactors[1] = 17;
	oFrameHeader.aSamplingFactors[2] = 17;
	oFrameHeader.aQuantizationTableSelector[0] = 0;
	oFrameHeader.aQuantizationTableSelector[1] = 1;
	oFrameHeader.aQuantizationTableSelector[2] = 1;
	oFrameHeader.nWidth = img_width;
	oFrameHeader.nHeight = img_height;

	for (int i = 0; i < oFrameHeader.nComponents; ++i)
	{
		nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f);
		nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4);
	}

	for (int i = 0; i < oFrameHeader.nComponents; ++i)
	{
		NppiSize oBlocks;
		NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f };

		oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7) / 8 *
			static_cast<float>(oBlocksPerMCU.width) / nMCUBlocksH);
		oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

		oBlocks.height = (int)ceil((oFrameHeader.nHeight + 7) / 8 *
			static_cast<float>(oBlocksPerMCU.height) / nMCUBlocksV);
		oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

		aSrcSize[i].width = oBlocks.width * 8;
		aSrcSize[i].height = oBlocks.height * 8;

		// Allocate Memory
		size_t nPitch;
		CHECK_NPP_STATUS(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height),"");
		aDCTStep[i] = static_cast<Npp32s>(nPitch);

		CHECK_NPP_STATUS(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height),"");

		aSrcPitch[i] = nPitch;
		aSrcImageStep[i] = static_cast<Npp32s>(nPitch);
	}

	//RGB2YUV
	cudaError_t cudaStatus;
	cudaStatus = cuda_common::RGB2YUV(d_srcRGB, img_width, img_height,
										apSrcImage[0], aSrcPitch[0], aSrcSize[0].width, aSrcSize[0].height,
										apSrcImage[1], aSrcPitch[1], aSrcSize[1].width, aSrcSize[1].height,
										apSrcImage[2], aSrcPitch[2], aSrcSize[2].width, aSrcSize[2].height);

	/**
	* Forward DCT, quantization and level shift part of the JPEG encoding.
	* Input is expected in 8x8 macro blocks and output is expected to be in 64x1
	* macro blocks. The new version of the primitive takes the ROI in image pixel size and
	* works with DCT coefficients that are in zig-zag order.
	*/
	int k = 0;
	CHECK_NPP_STATUS(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[0], aSrcImageStep[0],
															apdDCT[0], aDCTStep[0],
															pdQuantizationTables + k * 64,
															aSrcSize[0],
															pDCTState),"");
	k = 1;
	CHECK_NPP_STATUS(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[1], aSrcImageStep[1],
															apdDCT[1], aDCTStep[1],
															pdQuantizationTables + k * 64,
															aSrcSize[1],
															pDCTState),"");

	CHECK_NPP_STATUS(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apSrcImage[2], aSrcImageStep[2],
															apdDCT[2], aDCTStep[2],
															pdQuantizationTables + k * 64,
															aSrcSize[2],
															pDCTState),"");


    // Huffman Encoding
    Npp8u *pdScan;
    Npp32s nScanLength;
    CHECK_NPP_STATUS(cudaMalloc(&pdScan, 4 << 20),"");

    Npp8u *pJpegEncoderTemp;
    size_t nTempSize;
    CHECK_NPP_STATUS(nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize),"");
    CHECK_NPP_STATUS(cudaMalloc(&pJpegEncoderTemp, nTempSize),"");

    NppiEncodeHuffmanSpec *apHuffmanDCTable[3];
    NppiEncodeHuffmanSpec *apHuffmanACTable[3];

	/**
	* Allocates memory and creates a Huffman table in a format that is suitable for the encoder.
	*/
	NppStatus t_status ;
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[0].aCodes, nppiDCTable, &apHuffmanDCTable[0]);
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[0].aCodes, nppiACTable, &apHuffmanACTable[0]);
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[1]);
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[1]);
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[1].aCodes, nppiDCTable, &apHuffmanDCTable[2]);
	t_status = nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[1].aCodes, nppiACTable, &apHuffmanACTable[2]);

	/**
	* Huffman Encoding of the JPEG Encoding.
	* Input is expected to be 64x1 macro blocks and output is expected as byte stuffed huffman encoded JPEG scan.
	*/
	Npp32s nSs = 0;
	Npp32s nSe = 63;
	Npp32s nH = 0;
	Npp32s nL = 0;
    CHECK_NPP_STATUS(nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
													   0, nSs, nSe, nH, nL,
                                                       pdScan, &nScanLength,
                                                       apHuffmanDCTable,
                                                       apHuffmanACTable,
													   aSrcSize,
                                                       pJpegEncoderTemp),"");

    for (int i = 0; i < 3; ++i)
    {
        nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
        nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
    }

    // Write JPEG
    unsigned char *pDstJpeg = new unsigned char[4 << 20];
    unsigned char *pDstOutput = pDstJpeg;

    writeMarker(0x0D8, pDstOutput);
    writeJFIFTag(pDstOutput);
	writeQuantizationTable(aQuantizationTables[0], pDstOutput);
	writeQuantizationTable(aQuantizationTables[1], pDstOutput);

    writeFrameHeader(oFrameHeader, pDstOutput);
    writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
    writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
    writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
    writeHuffmanTable(pHuffmanACTables[1], pDstOutput);

	oScanHeader.nComponents = 3;
	oScanHeader.aComponentSelector[0] = 1;
	oScanHeader.aComponentSelector[1] = 2;
	oScanHeader.aComponentSelector[2] = 3;
	oScanHeader.aHuffmanTablesSelector[0] = 0;
	oScanHeader.aHuffmanTablesSelector[1] = 17;
	oScanHeader.aHuffmanTablesSelector[2] = 17;
	oScanHeader.nSs = 0;
	oScanHeader.nSe = 63;
	oScanHeader.nA = 0;

    writeScanHeader(oScanHeader, pDstOutput);
    CHECK_NPP_STATUS(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost),"");
    pDstOutput += nScanLength;
    writeMarker(0x0D9, pDstOutput);

    {
        // Write result to file.
		char* szOutputFile ="output.jpeg";
        std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
        outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
    }

    // Cleanup
    delete [] pDstJpeg;

    cudaFree(pJpegEncoderTemp);
    cudaFree(pdQuantizationTables);
    cudaFree(pdScan);

    nppiDCTFree(pDCTState);

    for (int i = 0; i < 3; ++i)
    {
        cudaFree(apdDCT[i]);
        cudaFree(apSrcImage[i]);
    }

    return EXIT_SUCCESS;
}

This is my jpeg encoding code.

/*
* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.[img][/img]
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users.  This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/

// This sample needs at least CUDA 5.5 and a GPU that has at least Compute Capability 2.0

// This sample demonstrates a simple image processing pipeline.
// First, a JPEG file is huffman decoded and inverse DCT transformed and dequantized.
// Then the different planes are resized. Finally, the resized image is quantized, forward
// DCT transformed and huffman encoded.

#include <npp.h>
#include <cuda_runtime.h>
#include <Exceptions.h>

#include "Endianess.h"
#include <math.h>

#include <string.h>
#include <fstream>
#include <iostream>

#include <helper_string.h>
#include <helper_cuda.h>

using namespace std;

struct FrameHeader
{
    unsigned char nSamplePrecision;
    unsigned short nHeight;
    unsigned short nWidth;
    unsigned char nComponents;
    unsigned char aComponentIdentifier[3];
    unsigned char aSamplingFactors[3];
    unsigned char aQuantizationTableSelector[3];
};

struct ScanHeader
{
    unsigned char nComponents;
    unsigned char aComponentSelector[3];
    unsigned char aHuffmanTablesSelector[3];
    unsigned char nSs;
    unsigned char nSe;
    unsigned char nA;
};

struct QuantizationTable
{
    unsigned char nPrecisionAndIdentifier;
    unsigned char aTable[64];
};

struct HuffmanTable
{
    unsigned char nClassAndIdentifier;
    unsigned char aCodes[16];
    unsigned char aTable[256];
};


int DivUp(int x, int d)
{
    return (x + d - 1) / d;
}

template<typename T>
T readAndAdvance(const unsigned char *&pData)
{
    T nElement = readBigEndian<T>(pData);
    pData += sizeof(T);
    return nElement;
}

template<typename T>
void writeAndAdvance(unsigned char *&pData, T nElement)
{
    writeBigEndian<T>(pData, nElement);
    pData += sizeof(T);
}


int nextMarker(const unsigned char *pData, int &nPos, int nLength)
{
    unsigned char c = pData[nPos++];

    do
    {
        while (c != 0xffu && nPos < nLength)
        {
            c =  pData[nPos++];
        }

        if (nPos >= nLength)
            return -1;

        c =  pData[nPos++];
    }
    while (c == 0 || c == 0x0ffu);

    return c;
}

void writeMarker(unsigned char nMarker, unsigned char *&pData)
{
    *pData++ = 0x0ff;
    *pData++ = nMarker;
}

void writeJFIFTag(unsigned char *&pData)
{
    const char JFIF_TAG[] =
    {
        0x4a, 0x46, 0x49, 0x46, 0x00,
        0x01, 0x02,
        0x00,
        0x00, 0x01, 0x00, 0x01,
        0x00, 0x00
    };

    writeMarker(0x0e0, pData);
    writeAndAdvance<unsigned short>(pData, sizeof(JFIF_TAG) + sizeof(unsigned short));
    memcpy(pData, JFIF_TAG, sizeof(JFIF_TAG));
    pData += sizeof(JFIF_TAG);
}

void loadJpeg(const char *input_file, unsigned char *&pJpegData, int &nInputLength)
{
    // Load file into CPU memory
    ifstream stream(input_file, ifstream::binary);

    if (!stream.good())
    {
        return;
    }

    stream.seekg(0, ios::end);
    nInputLength = (int)stream.tellg();
    stream.seekg(0, ios::beg);

    pJpegData = new unsigned char[nInputLength];
    stream.read(reinterpret_cast<char *>(pJpegData), nInputLength);
}

void readFrameHeader(const unsigned char *pData, FrameHeader &header)
{
    readAndAdvance<unsigned short>(pData);
    header.nSamplePrecision = readAndAdvance<unsigned char>(pData);
    header.nHeight = readAndAdvance<unsigned short>(pData);
    header.nWidth = readAndAdvance<unsigned short>(pData);
    header.nComponents = readAndAdvance<unsigned char>(pData);

    for (int c=0; c<header.nComponents; ++c)
    {
        header.aComponentIdentifier[c] = readAndAdvance<unsigned char>(pData);
        header.aSamplingFactors[c] = readAndAdvance<unsigned char>(pData);
        header.aQuantizationTableSelector[c] = readAndAdvance<unsigned char>(pData);
    }

}

void writeFrameHeader(const FrameHeader &header, unsigned char *&pData)
{
    unsigned char aTemp[128];
    unsigned char *pTemp = aTemp;

    writeAndAdvance<unsigned char>(pTemp, header.nSamplePrecision);
    writeAndAdvance<unsigned short>(pTemp, header.nHeight);
    writeAndAdvance<unsigned short>(pTemp, header.nWidth);
    writeAndAdvance<unsigned char>(pTemp, header.nComponents);

    for (int c=0; c<header.nComponents; ++c)
    {
        writeAndAdvance<unsigned char>(pTemp,header.aComponentIdentifier[c]);
        writeAndAdvance<unsigned char>(pTemp,header.aSamplingFactors[c]);
        writeAndAdvance<unsigned char>(pTemp,header.aQuantizationTableSelector[c]);
    }

    unsigned short nLength = (unsigned short)(pTemp - aTemp);

    writeMarker(0x0C0, pData);
    writeAndAdvance<unsigned short>(pData, nLength + 2);
    memcpy(pData, aTemp, nLength);
    pData += nLength;
}


void readScanHeader(const unsigned char *pData, ScanHeader &header)
{
    readAndAdvance<unsigned short>(pData);

    header.nComponents = readAndAdvance<unsigned char>(pData);

    for (int c=0; c<header.nComponents; ++c)
    {
        header.aComponentSelector[c] = readAndAdvance<unsigned char>(pData);
        header.aHuffmanTablesSelector[c] = readAndAdvance<unsigned char>(pData);
    }

    header.nSs = readAndAdvance<unsigned char>(pData);
    header.nSe = readAndAdvance<unsigned char>(pData);
    header.nA = readAndAdvance<unsigned char>(pData);
}


void writeScanHeader(const ScanHeader &header, unsigned char *&pData)
{
    unsigned char aTemp[128];
    unsigned char *pTemp = aTemp;

    writeAndAdvance<unsigned char>(pTemp, header.nComponents);

    for (int c=0; c<header.nComponents; ++c)
    {
        writeAndAdvance<unsigned char>(pTemp,header.aComponentSelector[c]);
        writeAndAdvance<unsigned char>(pTemp,header.aHuffmanTablesSelector[c]);
    }

    writeAndAdvance<unsigned char>(pTemp,  header.nSs);
    writeAndAdvance<unsigned char>(pTemp,  header.nSe);
    writeAndAdvance<unsigned char>(pTemp,  header.nA);

    unsigned short nLength = (unsigned short)(pTemp - aTemp);

    writeMarker(0x0DA, pData);
    writeAndAdvance<unsigned short>(pData, nLength + 2);
    memcpy(pData, aTemp, nLength);
    pData += nLength;
}


void readQuantizationTables(const unsigned char *pData, QuantizationTable *pTables)
{
    unsigned short nLength = readAndAdvance<unsigned short>(pData) - 2;

    while (nLength > 0)
    {
        unsigned char nPrecisionAndIdentifier = readAndAdvance<unsigned char>(pData);
        int nIdentifier = nPrecisionAndIdentifier & 0x0f;

        pTables[nIdentifier].nPrecisionAndIdentifier = nPrecisionAndIdentifier;
        memcpy(pTables[nIdentifier].aTable, pData, 64);
        pData += 64;

        nLength -= 65;
    }
}

void writeQuantizationTable(const QuantizationTable &table, unsigned char *&pData)
{
    writeMarker(0x0DB, pData);
    writeAndAdvance<unsigned short>(pData, sizeof(QuantizationTable) + 2);
    memcpy(pData, &table, sizeof(QuantizationTable));
    pData += sizeof(QuantizationTable);
}

void readHuffmanTables(const unsigned char *pData, HuffmanTable *pTables)
{
    unsigned short nLength = readAndAdvance<unsigned short>(pData) - 2;

    while (nLength > 0)
    {
        unsigned char nClassAndIdentifier = readAndAdvance<unsigned char>(pData);
        int nClass = nClassAndIdentifier >> 4; // AC or DC
        int nIdentifier = nClassAndIdentifier & 0x0f;
        int nIdx = nClass * 2 + nIdentifier;
        pTables[nIdx].nClassAndIdentifier = nClassAndIdentifier;

        // Number of Codes for Bit Lengths [1..16]
        int nCodeCount = 0;

        for (int i = 0; i < 16; ++i)
        {
            pTables[nIdx].aCodes[i] = readAndAdvance<unsigned char>(pData);
            nCodeCount += pTables[nIdx].aCodes[i];
        }

        memcpy(pTables[nIdx].aTable, pData, nCodeCount);
        pData += nCodeCount;

        nLength -= 17 + nCodeCount;
    }
}

void writeHuffmanTable(const HuffmanTable &table, unsigned char *&pData)
{
    writeMarker(0x0C4, pData);

    // Number of Codes for Bit Lengths [1..16]
    int nCodeCount = 0;

    for (int i = 0; i < 16; ++i)
    {
        nCodeCount += table.aCodes[i];
    }

    writeAndAdvance<unsigned short>(pData, 17 + nCodeCount + 2);
    memcpy(pData, &table, 17 + nCodeCount);
    pData += 17 + nCodeCount;
}


void readRestartInterval(const unsigned char *pData, int &nRestartInterval)
{
    readAndAdvance<unsigned short>(pData);
    nRestartInterval = readAndAdvance<unsigned short>(pData);
}

void printHelp()
{
    cout << "jpegNPP usage" << endl;
    cout << "   -input=srcfile.jpg     (input  file JPEG image)" << endl;
    cout << "   -output=destfile.jpg   (output file JPEG image)" << endl;
    cout << "   -scale=1.0             (scale multiplier for width and height)" << endl << endl;
}

bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor)
{
    const NppLibraryVersion *libVer   = nppGetLibVersion();

    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);

    int driverVersion, runtimeVersion;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    printf("  CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
    printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);

    bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
    return bVal;
}

int main(int argc, char **argv)
{
    // Min spec is SM 2.0 devices
    if (printfNPPinfo(argc, argv, 2, 0) == false)
    {
        cerr << "jpegNPP requires a GPU with Compute Capability 2.0 or higher" << endl;
        return EXIT_SUCCESS;
    }

    const char *szInputFile;
    const char *szOutputFile;
    float nScaleFactor;

    if ((argc == 1) || checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        printHelp();
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "input"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&szInputFile);
    }
    else
    {
        szInputFile = sdkFindFilePath("Growth_of_cubic_bacteria_25x16.jpg", argv[0]);
    }

    cout << "Source File: " << szInputFile << endl;

    if (checkCmdLineFlag(argc, (const char **)argv, "output"))
    {
        getCmdLineArgumentString(argc, (const char **)argv, "output", (char **)&szOutputFile);
    }
    else
    {
        szOutputFile = "scaled.jpg";
    }

    cout << "Output File: " << szOutputFile << endl;

    if (checkCmdLineFlag(argc, (const char **)argv, "scale"))
    {
        nScaleFactor = max(0.0f, min(getCmdLineArgumentFloat(argc, (const char **)argv, "scale"), 1.0f));
    }
    else
    {
        nScaleFactor = 0.5f;
    }

    cout << "Scale Factor: " << nScaleFactor << endl;

    NppiDCTState *pDCTState;
    NPP_CHECK_NPP(nppiDCTInitAlloc(&pDCTState));

    unsigned char *pJpegData = 0;
    int nInputLength = 0;

    // Load Jpeg
    loadJpeg(szInputFile, pJpegData, nInputLength);

    if (pJpegData == 0)
    {
        cerr << "Input File Error: " << szInputFile << endl;
        return EXIT_FAILURE;
    }

    /***************************
    *
    *   Input
    *
    ***************************/


    // Check if this is a valid JPEG file
    int nPos = 0;
    int nMarker = nextMarker(pJpegData, nPos, nInputLength);

    if (nMarker != 0x0D8)
    {
        cerr << "Invalid Jpeg Image" << endl;
        return EXIT_FAILURE;
    }

    nMarker = nextMarker(pJpegData, nPos, nInputLength);

    // Parsing and Huffman Decoding (on host)
    FrameHeader oFrameHeader;
    QuantizationTable aQuantizationTables[4];
    Npp8u *pdQuantizationTables;
    cudaMalloc(&pdQuantizationTables, 64 * 4);

    HuffmanTable aHuffmanTables[4];
    HuffmanTable *pHuffmanDCTables = aHuffmanTables;
    HuffmanTable *pHuffmanACTables = &aHuffmanTables[2];
    ScanHeader oScanHeader;
    memset(&oFrameHeader,0,sizeof(FrameHeader));
    memset(aQuantizationTables,0, 4 * sizeof(QuantizationTable));
    memset(aHuffmanTables,0, 4 * sizeof(HuffmanTable));
    int nMCUBlocksH = 0;
    int nMCUBlocksV = 0;

    int nRestartInterval = -1;

    NppiSize aSrcSize[3];
    Npp16s *aphDCT[3] = {0,0,0};
    Npp16s *apdDCT[3] = {0,0,0};
    Npp32s aDCTStep[3];

    Npp8u *apSrcImage[3] = {0,0,0};
    Npp32s aSrcImageStep[3];

    Npp8u *apDstImage[3] = {0,0,0};
    Npp32s aDstImageStep[3];
    NppiSize aDstSize[3];

    while (nMarker != -1)
    {
        if (nMarker == 0x0D8)
        {
            // Embedded Thumbnail, skip it
            int nNextMarker = nextMarker(pJpegData, nPos, nInputLength);

            while (nNextMarker != -1 && nNextMarker != 0x0D9)
            {
                nNextMarker = nextMarker(pJpegData, nPos, nInputLength);
            }
        }

        if (nMarker == 0x0DD)
        {
            readRestartInterval(pJpegData + nPos, nRestartInterval);
        }

        if ((nMarker == 0x0C0) | (nMarker == 0x0C2))
        {
            //Assert Baseline for this Sample
            //Note: NPP does support progressive jpegs for both encode and decode
            if (nMarker != 0x0C0)
            {
                cerr << "The sample does only support baseline JPEG images" << endl;
                return EXIT_SUCCESS;
            }

            // Baseline or Progressive Frame Header
            readFrameHeader(pJpegData + nPos, oFrameHeader);
            cout << "Image Size: " << oFrameHeader.nWidth << "x" << oFrameHeader.nHeight << "x" << static_cast<int>(oFrameHeader.nComponents) << endl;

            //Assert 3-Channel Image for this Sample
            if (oFrameHeader.nComponents != 3)
            {
                cerr << "The sample does only support color JPEG images" << endl;
                return EXIT_SUCCESS;
            }

            // Compute channel sizes as stored in the JPEG (8x8 blocks & MCU block layout)
            for (int i=0; i < oFrameHeader.nComponents; ++i)
            {
                nMCUBlocksV = max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] & 0x0f );
                nMCUBlocksH = max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] >> 4 );
            }

            for (int i=0; i < oFrameHeader.nComponents; ++i)
            {
                NppiSize oBlocks;
                NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i]  >> 4, oFrameHeader.aSamplingFactors[i] & 0x0f};

                oBlocks.width = (int)ceil((oFrameHeader.nWidth + 7)/8  *
                                          static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
                oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

                oBlocks.height = (int)ceil((oFrameHeader.nHeight+7)/8 *
                                           static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
                oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

                aSrcSize[i].width = oBlocks.width * 8;
                aSrcSize[i].height = oBlocks.height * 8;

                // Allocate Memory
                size_t nPitch;
                NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks.width * 64 * sizeof(Npp16s), oBlocks.height));
                aDCTStep[i] = static_cast<Npp32s>(nPitch);

                NPP_CHECK_CUDA(cudaMallocPitch(&apSrcImage[i], &nPitch, aSrcSize[i].width, aSrcSize[i].height));
                aSrcImageStep[i] = static_cast<Npp32s>(nPitch);

                NPP_CHECK_CUDA(cudaHostAlloc(&aphDCT[i], aDCTStep[i] * oBlocks.height, cudaHostAllocDefault));
            }
        }

        if (nMarker == 0x0DB)
        {
            // Quantization Tables
            readQuantizationTables(pJpegData + nPos, aQuantizationTables);
        }

        if (nMarker == 0x0C4)
        {
            // Huffman Tables
            readHuffmanTables(pJpegData + nPos, aHuffmanTables);
        }

        if (nMarker == 0x0DA)
        {
            // Scan
            readScanHeader(pJpegData + nPos, oScanHeader);
            nPos += 6 + oScanHeader.nComponents * 2;

            int nAfterNextMarkerPos = nPos;
            int nAfterScanMarker = nextMarker(pJpegData, nAfterNextMarkerPos, nInputLength);

            if (nRestartInterval > 0)
            {
                while (nAfterScanMarker >= 0x0D0 && nAfterScanMarker <= 0x0D7)
                {
                    // This is a restart marker, go on
                    nAfterScanMarker = nextMarker(pJpegData, nAfterNextMarkerPos, nInputLength);
                }
            }

            NppiDecodeHuffmanSpec *apHuffmanDCTable[3];
            NppiDecodeHuffmanSpec *apHuffmanACTable[3];

            for (int i = 0; i < 3; ++i)
            {
                nppiDecodeHuffmanSpecInitAllocHost_JPEG(pHuffmanDCTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, nppiDCTable, &apHuffmanDCTable[i]);
                nppiDecodeHuffmanSpecInitAllocHost_JPEG(pHuffmanACTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f)].aCodes, nppiACTable, &apHuffmanACTable[i]);
            }

            NPP_CHECK_NPP(nppiDecodeHuffmanScanHost_JPEG_8u16s_P3R(pJpegData + nPos, nAfterNextMarkerPos - nPos - 2,
                                                                   nRestartInterval, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f,
                                                                   aphDCT,  aDCTStep,
                                                                   apHuffmanDCTable,
                                                                   apHuffmanACTable,
                                                                   aSrcSize));

            for (int i = 0; i < 3; ++i)
            {
                nppiDecodeHuffmanSpecFreeHost_JPEG(apHuffmanDCTable[i]);
                nppiDecodeHuffmanSpecFreeHost_JPEG(apHuffmanACTable[i]);
            }
        }

        nMarker = nextMarker(pJpegData, nPos, nInputLength);
    }

    // Copy DCT coefficients and Quantization Tables from host to device
    for (int i = 0; i < 4; ++i)
    {
        NPP_CHECK_CUDA(cudaMemcpyAsync(pdQuantizationTables + i * 64, aQuantizationTables[i].aTable, 64, cudaMemcpyHostToDevice));
    }

    for (int i = 0; i < 3; ++i)
    {
        NPP_CHECK_CUDA(cudaMemcpyAsync(apdDCT[i], aphDCT[i], aDCTStep[i] * aSrcSize[i].height / 8, cudaMemcpyHostToDevice));
    }

    // Inverse DCT
    for (int i = 0; i < 3; ++i)
    {
        NPP_CHECK_NPP(nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R_NEW(apdDCT[i], aDCTStep[i],
                                                              apSrcImage[i], aSrcImageStep[i],
                                                              pdQuantizationTables + oFrameHeader.aQuantizationTableSelector[i] * 64,
                                                              aSrcSize[i],
                                                              pDCTState));
    }


    /***************************
    *
    *   Processing
    *
    ***************************/

    // Compute channel sizes as stored in the output JPEG (8x8 blocks & MCU block layout)
    NppiSize oDstImageSize;
    float frameWidth = floor((float)oFrameHeader.nWidth * (float)nScaleFactor);
    float frameHeight = floor((float)oFrameHeader.nHeight * (float)nScaleFactor);

    oDstImageSize.width  = (int)max(1.0f, frameWidth);
    oDstImageSize.height = (int)max(1.0f, frameHeight);

    cout << "Output Size: " << oDstImageSize.width << "x" << oDstImageSize.height << "x" << static_cast<int>(oFrameHeader.nComponents) << endl;

    for (int i=0; i < oFrameHeader.nComponents; ++i)
    {
        NppiSize oBlocks;
        NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4};

        oBlocks.width = (int)ceil((oDstImageSize.width + 7)/8  *
                                  static_cast<float>(oBlocksPerMCU.width)/nMCUBlocksH);
        oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

        oBlocks.height = (int)ceil((oDstImageSize.height+7)/8 *
                                   static_cast<float>(oBlocksPerMCU.height)/nMCUBlocksV);
        oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

        aDstSize[i].width = oBlocks.width * 8;
        aDstSize[i].height = oBlocks.height * 8;

        // Allocate Memory
        size_t nPitch;
        NPP_CHECK_CUDA(cudaMallocPitch(&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
        aDstImageStep[i] = static_cast<Npp32s>(nPitch);
    }

    // Scale to target image size
    for (int i = 0; i < 3; ++i)
    {
        NppiSize oBlocksPerMCU = { oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4};

        NppiSize oSrcImageSize = {(oFrameHeader.nWidth * oBlocksPerMCU.width) / nMCUBlocksH, (oFrameHeader.nHeight * oBlocksPerMCU.height)/nMCUBlocksV};
        NppiRect oSrcImageROI = {0,0,oSrcImageSize.width, oSrcImageSize.height};
        NppiRect oDstImageROI;

        NppiInterpolationMode eInterploationMode = NPPI_INTER_SUPER;

        if (nScaleFactor >= 1.f)
            eInterploationMode = NPPI_INTER_LANCZOS;

        NPP_CHECK_NPP(nppiGetResizeRect(oSrcImageROI, &oDstImageROI,
                                        nScaleFactor,
                                        nScaleFactor,
                                        0.5, 0.5, eInterploationMode));

        NPP_CHECK_NPP(nppiResizeSqrPixel_8u_C1R(apSrcImage[i], oSrcImageSize, aSrcImageStep[i], oSrcImageROI,
                                                apDstImage[i], aDstImageStep[i], oDstImageROI ,
                                                nScaleFactor,
                                                nScaleFactor,
                                                0.5, 0.5, eInterploationMode));
    }


    /***************************
    *
    *   Output
    *
    ***************************/

    
    int nComponents = 3;
    aDstSize[0].width = 800;
    aDstSize[0].height = 608;
    aDstSize[1].width = aDstSize[2].width = 400;
    aDstSize[1].height = aDstSize[2].height = 304;
    int oBlocks_width[3] = {800/8, 400/8, 400/8};
    int oBlocks_height[3] = {608/8, 304/8, 304/8};

    // Allocate Memory
    for (int i = 0; i < nComponents; ++i) {
        size_t nPitch;
        aDstSize[i] = 
        NPP_CHECK_CUDA(cudaMallocPitch(&apDstImage[i], &nPitch, aDstSize[i].width, aDstSize[i].height));
        aDstImageStep[i] = static_cast<Npp32s>(nPitch);

        NPP_CHECK_CUDA(cudaMallocPitch(&apdDCT[i], &nPitch, oBlocks_width[i] * 64 * sizeof(Npp16s), oBlocks_height[i]));
        aDCTStep[i] = static_cast<Npp32s>(nPitch);
    }

    // Forward DCT
    for (int i = 0; i < 3; ++i)
    {
        NPP_CHECK_NPP(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R_NEW(apDstImage[i], aDstImageStep[i],
                                                              apdDCT[i], aDCTStep[i],
                                                              pdQuantizationTables + oFrameHeader.aQuantizationTableSelector[i] * 64,
                                                              aDstSize[i],
                                                              pDCTState));
    }


    // Huffman Encoding
    Npp8u *pdScan;
    Npp32s nScanLength;
    NPP_CHECK_CUDA(cudaMalloc(&pdScan, 4 << 20));

    Npp8u *pJpegEncoderTemp;
    size_t nTempSize;
    NPP_CHECK_NPP(nppiEncodeHuffmanGetSize(aSrcSize[0], 3, &nTempSize));
    NPP_CHECK_CUDA(cudaMalloc(&pJpegEncoderTemp, nTempSize));

    NppiEncodeHuffmanSpec *apHuffmanDCTable[3];
    NppiEncodeHuffmanSpec *apHuffmanACTable[3];

    for (int i = 0; i < 3; ++i)
    {
        nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanDCTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, nppiDCTable, &apHuffmanDCTable[i]);
        nppiEncodeHuffmanSpecInitAlloc_JPEG(pHuffmanACTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f)].aCodes, nppiACTable, &apHuffmanACTable[i]);
    }

    NPP_CHECK_NPP(nppiEncodeHuffmanScan_JPEG_8u16s_P3R(apdDCT, aDCTStep,
                                                       0, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f,
                                                       pdScan, &nScanLength,
                                                       apHuffmanDCTable,
                                                       apHuffmanACTable,
                                                       aDstSize,
                                                       pJpegEncoderTemp));

    for (int i = 0; i < 3; ++i)
    {
        nppiEncodeHuffmanSpecFree_JPEG(apHuffmanDCTable[i]);
        nppiEncodeHuffmanSpecFree_JPEG(apHuffmanACTable[i]);
    }

    // Write JPEG
    unsigned char *pDstJpeg = new unsigned char[4 << 20];
    unsigned char *pDstOutput = pDstJpeg;

    oFrameHeader.nWidth = oDstImageSize.width;
    oFrameHeader.nHeight = oDstImageSize.height;

    writeMarker(0x0D8, pDstOutput);
    writeJFIFTag(pDstOutput);
    writeQuantizationTable(aQuantizationTables[0], pDstOutput);
    writeQuantizationTable(aQuantizationTables[1], pDstOutput);
    writeFrameHeader(oFrameHeader, pDstOutput);
    writeHuffmanTable(pHuffmanDCTables[0], pDstOutput);
    writeHuffmanTable(pHuffmanACTables[0], pDstOutput);
    writeHuffmanTable(pHuffmanDCTables[1], pDstOutput);
    writeHuffmanTable(pHuffmanACTables[1], pDstOutput);
    writeScanHeader(oScanHeader, pDstOutput);
    NPP_CHECK_CUDA(cudaMemcpy(pDstOutput, pdScan, nScanLength, cudaMemcpyDeviceToHost));
    pDstOutput += nScanLength;
    writeMarker(0x0D9, pDstOutput);

    {
        // Write result to file.
        std::ofstream outputFile(szOutputFile, ios::out | ios::binary);
        outputFile.write(reinterpret_cast<const char *>(pDstJpeg), static_cast<int>(pDstOutput - pDstJpeg));
    }

    // Cleanup
    delete [] pJpegData;
    delete [] pDstJpeg;

    cudaFree(pJpegEncoderTemp);
    cudaFree(pdQuantizationTables);
    cudaFree(pdScan);

    nppiDCTFree(pDCTState);

    for (int i = 0; i < 3; ++i)
    {
        cudaFree(apdDCT[i]);
        cudaFreeHost(aphDCT[i]);
        cudaFree(apSrcImage[i]);
        cudaFree(apDstImage[i]);
    }

    return EXIT_SUCCESS;
}