// I had some issues getting the JPEG-compression-related functions to work because of bugs and vague documentation, but here is some proof-of-concept code that may save you some time:
/*
The MIT License
Copyright (c) 2010 Stephen Rhein
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* This example proof of concept has been derived from a comparable example
* from the Intel Performance Primitives documentation at the following link:
* http://software.intel.com/sites/products/documentation/hpc/composerxe/en-us/ippxe/ipp_manual_lnx/IPPI/ippi_ch15/functn_DCTQuantFwd8x8LS_JPEG.htm#ex15-4
*/
#include <npp.h>
#include <cuda_runtime.h>
#include <Exceptions.h>
#include <stdio.h>
#include <stdlib.h>
// 8x8 block of 8-bit samples, stored row by row, that main() copies to the
// device and feeds to the forward DCT.
const Npp8u src[8 * 8] = {
    4, 4, 4, 4, 4, 4, 4, 4,
    4, 3, 3, 3, 3, 3, 3, 4,
    4, 3, 2, 2, 2, 2, 3, 4,
    4, 3, 2, 1, 1, 2, 3, 4,
    4, 3, 2, 1, 1, 2, 3, 4,
    4, 3, 2, 2, 2, 2, 3, 4,
    4, 3, 3, 3, 3, 3, 3, 4,
    4, 4, 4, 4, 4, 4, 4, 4
};
// Raw quantization table. main() reads entry i as the i'th element in zigzag
// order, and the table is deliberately not const: main() hands it to
// nppiQuantFwdRawTableInit_JPEG_8u, which adjusts it for the quality factor.
// NOTE(review): the values look like the example luminance table from the
// JPEG specification -- confirm before relying on that.
Npp8u pQuantRawTable[64] = {
     16,  11,  12,  14,  12,  10,  16,  14,
     13,  14,  18,  17,  16,  19,  24,  40,
     26,  24,  22,  22,  24,  49,  35,  37,
     29,  40,  58,  51,  61,  60,  57,  51,
     56,  55,  64,  72,  92,  78,  64,  68,
     87,  69,  55,  56,  80, 109,  81,  87,
     95,  98, 103, 104, 103,  62,  77, 113,
    121, 112, 100, 120,  92, 101, 103,  99
};
// Quality factor applied to the raw quantization table.
const int quality = 75;
// Fixed-point scale (2^15): main() stores scale / q in the forward table so
// that quantization can avoid a division.
const int scale = 1 << 15;
/*
 * Zigzag-to-natural index map: entry i holds the natural-order (row-major)
 * position of the i'th element in zigzag order.
 */
const int convert_zigzag2natural[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
int main() {
try {
Npp16u pQuantFwdTable[64];
Npp16u pQuantInvTable[64];
Npp16s dstDCT[64];
Npp8u dstIDCT[64];
NppiSize invDCTroi;
NppiSize fwdDCTroi;
Npp8u *devSrc;
Npp16s *devDstDCT;
Npp8u *devDstIDCT;
Npp16u *devPQuantFwdTable;
Npp16u* devPQuantInvTable;
//Allocate device memory and initialize variables
NPP_CHECK_CUDA(cudaMalloc(&devSrc,64 * sizeof(Npp8u)));
NPP_CHECK_CUDA(cudaMalloc(&devDstIDCT,64 * sizeof(Npp8u)));
NPP_CHECK_CUDA(cudaMalloc(&devDstDCT,64 * sizeof(Npp16s)));
NPP_CHECK_CUDA(cudaMalloc(&devPQuantFwdTable,64 * sizeof(Npp16u)));
NPP_CHECK_CUDA(cudaMalloc(&devPQuantInvTable,64 * sizeof(Npp16u)));
//Forward DCT regions of interest are pixel based
fwdDCTroi.height = 8;
fwdDCTroi.width = 8;
//Inverse DCT regions of interest are coefficient based
invDCTroi.height = 1;
invDCTroi.width = 64;
//Transform raw quantization table according to quality factor
NPP_CHECK_NPP(nppiQuantFwdRawTableInit_JPEG_8u(pQuantRawTable, quality));
//The function below has a bug, but the loop below achieves the correct result.
//NPP_CHECK_NPP(nppiQuantFwdTableInit_JPEG_8u16u(pQuantRawTable, pQuantFwdTable));
for (int i = 0; i < 64; ++i) {
pQuantFwdTable[convert_zigzag2natural[i]] = (scale
/ (double) pQuantRawTable[i]) + 0.5;
}
NPP_CHECK_CUDA(cudaMemcpy(devPQuantFwdTable, pQuantFwdTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));
//The function below has a bug, but the loop below achieves the correct result.
//NPP_CHECK_NPP(nppiQuantInvTableInit_JPEG_8u16u(pQuantRawTable, pQuantInvTable));
for (int i = 0; i < 64; ++i) {
pQuantInvTable[convert_zigzag2natural[i]] = pQuantRawTable[i];
}
NPP_CHECK_CUDA(cudaMemcpy(devPQuantInvTable, pQuantInvTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));
//Copy src to device and begin transformations
NPP_CHECK_CUDA(cudaMemcpy(devSrc,src,64 * sizeof(Npp8u),cudaMemcpyHostToDevice));
//Perform forward DCT
NPP_CHECK_NPP(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R(devSrc, 8
* sizeof(Npp8u), devDstDCT, 64 * sizeof(Npp16s), devPQuantFwdTable,
fwdDCTroi));
NPP_CHECK_CUDA(cudaMemcpy(dstDCT,devDstDCT,64 * sizeof(Npp16s),cudaMemcpyDeviceToHost));
//Perform inverse DCT
NPP_CHECK_NPP(nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(devDstDCT, 64
* sizeof(Npp16s), devDstIDCT, 8 * sizeof(Npp8u), devPQuantInvTable,
invDCTroi));
NPP_CHECK_CUDA(cudaMemcpy(dstIDCT,devDstIDCT,64 * sizeof(Npp8u),cudaMemcpyDeviceToHost));
//PRINT RESULTS
puts("src");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%4u", src[i * 8 + j]);
}
puts("");
}
puts("");
puts("pQuantRawTable (with quality)");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%4u", pQuantRawTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("pQuantFwdTable");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%6u", pQuantFwdTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("pQuantInvTable");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%6u", pQuantInvTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("dstDCT");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%7d", dstDCT[i * 8 + j]);
}
puts("");
}
puts("");
puts("dstIDCT");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%4d", dstIDCT[i * 8 + j]);
}
puts("");
}
puts("Lossyness");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5d", src[i * 8 + j] - dstIDCT[i * 8 + j]);
}
puts("");
}
} catch (npp::Exception e) {
printf("%s\n", e.toString().c_str());
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}