Along the same lines as my initial bug report, I found a problem with nppiQuantInvTableInit_JPEG_8u16u.
Ironically, it produces “correct” output when used in conjunction with the buggy nppiQuantFwdTableInit_JPEG_8u16u,
but produces incorrect output otherwise:
#include <npp.h>
#include <cuda_runtime.h>
#include <Exceptions.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char** argv) {
try {
Npp8u pQuantRawTable[64] = { 16, 11, 12, 14, 12, 10, 16, 14,
13, 14, 18, 17, 16, 19, 24, 40,
26, 24, 22, 22, 24, 49, 35, 37,
29, 40, 58, 51, 61, 60, 57, 51,
56, 55, 64, 72, 92, 78, 64, 68,
87, 69, 55, 56, 80, 109, 81, 87,
95, 98, 103, 104, 103, 62, 77, 113,
121, 112, 100, 120, 92, 101, 103, 99 };
Npp8u src[8 * 8] = { 4, 4, 4, 4, 4, 4, 4, 4,
4, 3, 3, 3, 3, 3, 3, 4,
4, 3, 2, 2, 2, 2, 3, 4,
4, 3, 2, 1, 1, 2, 3, 4,
4, 3, 2, 1, 1, 2, 3, 4,
4, 3, 2, 2, 2, 2, 3, 4,
4, 3, 3, 3, 3, 3, 3, 4,
4, 4, 4, 4, 4, 4, 4, 4 };
Npp16u pQuantFwdTable[64] = { 4096, 5461, 6554, 4096, 2731, 1638, 1260,
1057,
5461, 5461, 4681, 3277, 2521, 1130, 1092, 1170,
4681, 4681, 4096, 2731, 1638, 1130, 936, 1170,
4681, 3641, 2979, 2185, 1260, 745, 819, 1057,
3641, 2979, 1725, 1170, 964, 596, 630, 840,
2731, 1820, 1170, 1024, 799, 630, 575, 712,
1311, 1024, 840, 745, 630, 537, 546, 643,
910, 712, 683, 669, 585, 655, 630, 655 };
Npp16u pQuantInvTable[64];
Npp16s dst[64];
Npp8u *devSrc;
Npp16s *devDst;
Npp16u *devPQuantFwdTable;
Npp16u* devPQuantInvTable;
int quality = 75;
NppiSize roi;
NppiSize InvROI;
puts("src");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5hd", src[i * 8 + j]);
}
puts("");
}
puts("");
NPP_CHECK_CUDA(cudaMalloc(&devSrc,64 * sizeof(Npp8u)));
NPP_CHECK_CUDA(cudaMemcpy(devSrc,src,64 * sizeof(Npp8u),cudaMemcpyHostToDevice));
NPP_CHECK_CUDA(cudaMalloc(&devDst,64 * sizeof(Npp16s)));
NPP_CHECK_CUDA(cudaMalloc(&devPQuantFwdTable,64 * sizeof(Npp16u)));
NPP_CHECK_CUDA(cudaMalloc(&devPQuantInvTable,64 * sizeof(Npp16u)));
roi.height = 8;
roi.width = 8;
InvROI.height = 1;
InvROI.width = 64;
NPP_CHECK_NPP(nppiQuantFwdRawTableInit_JPEG_8u(pQuantRawTable, quality));
//If the line below is uncommented, inverse DCT appears accurate, but with the correct forward quant table, it does not
//NPP_CHECK_NPP(nppiQuantFwdTableInit_JPEG_8u16u(pQuantRawTable, pQuantFwdTable));
NPP_CHECK_NPP(nppiQuantInvTableInit_JPEG_8u16u(pQuantRawTable, pQuantInvTable));
NPP_CHECK_CUDA(cudaMemcpy(devPQuantFwdTable, pQuantFwdTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));
NPP_CHECK_CUDA(cudaMemcpy(devPQuantInvTable, pQuantInvTable,64 * sizeof(Npp16u),cudaMemcpyHostToDevice));
NPP_CHECK_NPP(nppiDCTQuantFwd8x8LS_JPEG_8u16s_C1R(devSrc, 8
* sizeof(Npp8u), devDst, 64 * sizeof(Npp16s), devPQuantFwdTable,
roi));
NPP_CHECK_NPP(nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(devDst, 64
* sizeof(Npp16s), devSrc, 8 * sizeof(Npp8u), devPQuantInvTable,
InvROI));
NPP_CHECK_CUDA(cudaMemcpy(dst,devDst,64 * sizeof(Npp16s),cudaMemcpyDeviceToHost));
NPP_CHECK_CUDA(cudaMemcpy(src,devSrc,64 * sizeof(Npp8u),cudaMemcpyDeviceToHost));
//PRINT RESULTS
puts("pQuantRawTable (with quality)");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%3u", pQuantRawTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("pQuantFwdTable");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5hu", pQuantFwdTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("pQuantInvTable");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5hu", pQuantInvTable[i * 8 + j]);
}
puts("");
}
puts("");
puts("dst");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5hd", dst[i * 8 + j]);
}
puts("");
}
puts("");
puts("src");
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
printf("%5hd", src[i * 8 + j]);
}
puts("");
}
} catch (npp::Exception e) {
printf("%s\n", e.toString().c_str());
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}