JPEG Compression using NPP Proof of Concept

I had some issues getting NPP's JPEG-compression functions to work because of bugs and vague documentation, so here is some proof-of-concept code that may save you some time:

/*

 The MIT License

Copyright (c) 2010 Stephen Rhein

Permission is hereby granted, free of charge, to any person obtaining a copy

 of this software and associated documentation files (the "Software"), to deal

 in the Software without restriction, including without limitation the rights

 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

 copies of the Software, and to permit persons to whom the Software is

 furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in

 all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN

 THE SOFTWARE.

 */

/*

 * This example proof of concept has been derived from a comparable example

 * from the Intel Performance Primitives documentation at the following link:

 * http://software.intel.com/sites/products/documentation/hpc/composerxe/en-us/ippxe/ipp_manual_lnx/IPPI/ippi_ch15/functn_DCTQuantFwd8x8LS_JPEG.htm#ex15-4

 */

#include <npp.h>

#include <cuda_runtime.h>

#include <Exceptions.h>

#include <stdio.h>

#include <stdlib.h>

// 8x8 block of sample pixels fed to the forward DCT. main() reads it as
// eight rows of eight values; the values fall from 4 on the border to 1 in
// the centre.
const Npp8u src[64] = {
	4, 4, 4, 4, 4, 4, 4, 4,
	4, 3, 3, 3, 3, 3, 3, 4,
	4, 3, 2, 2, 2, 2, 3, 4,
	4, 3, 2, 1, 1, 2, 3, 4,
	4, 3, 2, 1, 1, 2, 3, 4,
	4, 3, 2, 2, 2, 2, 3, 4,
	4, 3, 3, 3, 3, 3, 3, 4,
	4, 4, 4, 4, 4, 4, 4, 4
};

// Raw quantization table. main() treats index i as a zigzag-order position
// and maps it to natural order through convert_zigzag2natural[i].
// Deliberately non-const: main() passes it to
// nppiQuantFwdRawTableInit_JPEG_8u together with the quality factor and
// afterwards prints it as the table "with quality".
Npp8u pQuantRawTable[64] = {
	 16,  11,  12,  14,  12,  10,  16,  14,
	 13,  14,  18,  17,  16,  19,  24,  40,
	 26,  24,  22,  22,  24,  49,  35,  37,
	 29,  40,  58,  51,  61,  60,  57,  51,
	 56,  55,  64,  72,  92,  78,  64,  68,
	 87,  69,  55,  56,  80, 109,  81,  87,
	 95,  98, 103, 104, 103,  62,  77, 113,
	121, 112, 100, 120,  92, 101, 103,  99
};

// Quality factor that main() hands to nppiQuantFwdRawTableInit_JPEG_8u to
// transform the raw quantization table.
const int quality = 75;

// Fixed-point scale (2^15). main() stores each forward-table entry as
// scale / q rounded to nearest, so quantization needs no division.
const int scale = 1 << 15;

/*
 * Zigzag -> natural index map for an 8x8 block:
 * convert_zigzag2natural[i] gives the natural-order (row-major) position of
 * the element that sits at position i in zigzag order.
 */
const int convert_zigzag2natural[64] = {
	 0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63
};

int main() {

	try {

		Npp16u pQuantFwdTable[64];

		Npp16u pQuantInvTable[64];

		Npp16s dstDCT[64];

		Npp8u dstIDCT[64];

		NppiSize invDCTroi;

		NppiSize fwdDCTroi;

		Npp8u *devSrc;

		Npp16s *devDstDCT;

		Npp8u *devDstIDCT;

		Npp16u *devPQuantFwdTable;

		Npp16u* devPQuantInvTable;

		//Allocate device memory and initialize variables

		NPP_CHECK_CUDA(cudaMalloc(&devSrc,64 * sizeof(Npp8u)));

		NPP_CHECK_CUDA(cudaMalloc(&devDstIDCT,64 * sizeof(Npp8u)));

		NPP_CHECK_CUDA(cudaMalloc(&devDstDCT,64 * sizeof(Npp16s)));

		NPP_CHECK_CUDA(cudaMalloc(&devPQuantFwdTable,64 * sizeof(Npp16u)));

		NPP_CHECK_CUDA(cudaMalloc(&devPQuantInvTable,64 * sizeof(Npp16u)));

		//Forward DCT regions of interest are pixel based

		fwdDCTroi.height = 8;

		fwdDCTroi.width = 8;

		//Inverse DCT regions of interest are coefficient based

		invDCTroi.height = 1;

		invDCTroi.width = 64;

		//Transform raw quantization table according to quality factor

		NPP_CHECK_NPP(nppiQuantFwdRawTableInit_JPEG_8u(pQuantRawTable, quality));

		//The function below has a bug, but the loop below achieves the correct result.

		//NPP_CHECK_NPP(nppiQuantFwdTableInit_JPEG_8u16u(pQuantRawTable, pQuantFwdTable));

		for (int i = 0; i < 64; ++i) {

			pQuantFwdTable[convert_zigzag2natural[i]] = (scale

					/ (double) pQuantRawTable[i]) + 0.5;

		}

		NPP_CHECK_CUDA(cudaMemcpy(devPQuantFwdTable, pQuantFwdTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));

		//The function below has a bug, but the loop below achieves the correct result.

		//NPP_CHECK_NPP(nppiQuantInvTableInit_JPEG_8u16u(pQuantRawTable, pQuantInvTable));

		for (int i = 0; i < 64; ++i) {

			pQuantInvTable[convert_zigzag2natural[i]] = pQuantRawTable[i];

		}

		NPP_CHECK_CUDA(cudaMemcpy(devPQuantInvTable, pQuantInvTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));

		//Copy src to device and begin transformations

		NPP_CHECK_CUDA(cudaMemcpy(devSrc,src,64 * sizeof(Npp8u),cudaMemcpyHostToDevice));

		//Perform forward DCT

		NPP_CHECK_NPP(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R(devSrc, 8

						* sizeof(Npp8u), devDstDCT, 64 * sizeof(Npp16s), devPQuantFwdTable,

						fwdDCTroi));

		NPP_CHECK_CUDA(cudaMemcpy(dstDCT,devDstDCT,64 * sizeof(Npp16s),cudaMemcpyDeviceToHost));

		//Perform inverse DCT

		NPP_CHECK_NPP(nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(devDstDCT, 64

						* sizeof(Npp16s), devDstIDCT, 8 * sizeof(Npp8u), devPQuantInvTable,

						invDCTroi));

		NPP_CHECK_CUDA(cudaMemcpy(dstIDCT,devDstIDCT,64 * sizeof(Npp8u),cudaMemcpyDeviceToHost));

		//PRINT RESULTS

		puts("src");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%4u", src[i * 8 + j]);

			}

			puts("");

		}

		puts("");

		puts("pQuantRawTable (with quality)");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%4u", pQuantRawTable[i * 8 + j]);

			}

			puts("");

		}

		puts("");

		puts("pQuantFwdTable");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%6u", pQuantFwdTable[i * 8 + j]);

			}

			puts("");

		}

		puts("");

		puts("pQuantInvTable");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%6u", pQuantInvTable[i * 8 + j]);

			}

			puts("");

		}

		puts("");

		puts("dstDCT");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%7d", dstDCT[i * 8 + j]);

			}

			puts("");

		}

		puts("");

		puts("dstIDCT");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%4d", dstIDCT[i * 8 + j]);

			}

			puts("");

		}

		puts("Lossyness");

		for (int i = 0; i < 8; ++i) {

			for (int j = 0; j < 8; ++j) {

				printf("%5d", src[i * 8 + j] - dstIDCT[i * 8 + j]);

			}

			puts("");

		}

	} catch (npp::Exception e) {

		printf("%s\n", e.toString().c_str());

		return EXIT_FAILURE;

	}

	return EXIT_SUCCESS;

}

Thanks so much for documenting your work! :)

My pleasure, I’m glad it was still useful after 7.5 years :)

Hi PapaSmurf007, is it possible for me to do JPEG encoding in C? I have already converted the RGB buffer to a YUV buffer, and now I want to encode the YUV buffer as a JPEG so I can save the image. Thanks in advance.