I am trying to downsample my images with the NPP library. I am developing on a Jetson Orin (64 GB) using CUDA 11.4.
I tried the resize function, first resizing 1024 * 768 data to 512 * 768, and that works. However, I actually need to reduce 32000 * 121 data to 2048 * 121, and in that case every output value is 0, so it does not work. My code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <npp.h>
#include <cuda_runtime.h>
#include <nppi.h>
#include <nppdefs.h>
#include <iostream>
using namespace std;
#define NPP_INTER_LINEAR 2
//#include<nppdef.h>
// Reports a failed CUDA runtime call made on behalf of decimate_cuda, releases
// both device buffers and terminates the process with a failure status.
// Returns normally (doing nothing) when err is cudaSuccess.
static void decimate_cuda_check(cudaError_t err, const char* what, float* devSrc, float* devDst)
{
    if (err == cudaSuccess) {
        return;
    }
    fprintf(stderr, "Error : %s failed in decimate_cuda: %s\n", what, cudaGetErrorString(err));
    cudaFree(devSrc);  // cudaFree(NULL) is a no-op, so not-yet-allocated buffers are safe here
    cudaFree(devDst);
    exit(EXIT_FAILURE);
}

// Resizes a single-channel float image on the GPU with nppiResize_32f_C1R
// (NPPI_INTER_SUPER interpolation) and returns the result in a new host buffer.
//
// Parameters:
//   readbuff      host pointer to the source image, read as nSrcH densely packed
//                 rows of nSrcW floats
//   nSrcH, nSrcW  source height and width in pixels (note: height comes first)
//   nDstH, nDstW  destination height and width in pixels (note: height comes first)
//
// Returns a malloc'ed host buffer holding nDstH densely packed rows of nDstW
// floats; the caller owns it and must free() it.
//
// On any failure (invalid arguments, CUDA error, NPP status other than
// NPP_SUCCESS, host allocation failure) a message is printed and the process
// exits with a non-zero status.
float* decimate_cuda(float* readbuff, int nSrcH, int nSrcW, int nDstH, int nDstW)
{
    if (readbuff == NULL || nSrcH <= 0 || nSrcW <= 0 || nDstH <= 0 || nDstW <= 0) {
        fprintf(stderr, "Error : invalid arguments to decimate_cuda (src %dx%d, dst %dx%d)\n",
                nSrcW, nSrcH, nDstW, nDstH);
        exit(EXIT_FAILURE);
    }

    // Widths of one packed host row in bytes; computed in size_t to avoid int overflow.
    const size_t srcRowBytes = (size_t)nSrcW * sizeof(Npp32f);
    const size_t dstRowBytes = (size_t)nDstW * sizeof(Npp32f);

    size_t srcStep = 0;  // device pitch (bytes per row) chosen by cudaMallocPitch
    size_t dstStep = 0;
    float* devSrc = NULL;
    float* devDst = NULL;
    cudaError_t err;

    // Upload the source image into pitched device memory.
    err = cudaMallocPitch((void**)&devSrc, &srcStep, srcRowBytes, (size_t)nSrcH);
    decimate_cuda_check(err, "cudaMallocPitch(src)", devSrc, devDst);

    err = cudaMemcpy2D(devSrc, srcStep, readbuff, srcRowBytes, srcRowBytes, (size_t)nSrcH,
                       cudaMemcpyHostToDevice);
    decimate_cuda_check(err, "cudaMemcpy2D(host to device)", devSrc, devDst);

    err = cudaMallocPitch((void**)&devDst, &dstStep, dstRowBytes, (size_t)nDstH);
    decimate_cuda_check(err, "cudaMallocPitch(dst)", devSrc, devDst);

    // srcStep is a size_t, so it must be printed with %zu rather than %d.
    printf("nSrcW=%d srcStep=%zu\n", nSrcW, srcStep);

    // The whole source image is mapped onto the whole destination image.
    NppiSize oSrcSize = {nSrcW, nSrcH};
    NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
    NppiSize oDstSize = {nDstW, nDstH};
    NppiRect oDstROI = {0, 0, nDstW, nDstH};

    // NPP takes the row pitches as int, so narrow them explicitly.
    NppStatus result = nppiResize_32f_C1R(devSrc, static_cast<int>(srcStep), oSrcSize, oSrcROI,
                                          devDst, static_cast<int>(dstStep), oDstSize, oDstROI,
                                          NPPI_INTER_SUPER);
    if (result != NPP_SUCCESS) {
        std::cout << "Unable to run decimate_cuda, error " << result << std::endl;
        cudaFree(devSrc);
        cudaFree(devDst);
        exit(EXIT_FAILURE);  // a failed resize must not report success to the shell
    }

    // Densely packed host buffer for the result.
    const Npp64s writesize = (Npp64s)nDstW * nDstH;
    Npp32f* hostDst = (Npp32f*)malloc((size_t)writesize * sizeof(Npp32f));
    if (hostDst == NULL) {
        printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
        cudaFree(devSrc);
        cudaFree(devDst);
        exit(1);
    }

    // Checking this copy matters: a failure here means hostDst does not hold a valid
    // result (the CUDA runtime may also report errors from earlier asynchronous work
    // through this return value).
    err = cudaMemcpy2D(hostDst, dstRowBytes, devDst, dstStep, dstRowBytes, (size_t)nDstH,
                       cudaMemcpyDeviceToHost);
    decimate_cuda_check(err, "cudaMemcpy2D(device to host)", devSrc, devDst);

    cudaFree(devSrc);
    cudaFree(devDst);
    return hostDst;
}
// Demo driver: builds a width x height ramp image (each pixel holds its column
// index), halves its width with decimate_cuda and prints the first 20 input and
// output samples side by side.
int main() {
    const int width = 1030;
    const int height = 513;

    // Allocate and fill the input matrix (row-major, densely packed).
    float* inputMatrix = (float*)malloc((size_t)width * height * sizeof(float));
    if (inputMatrix == NULL) {
        printf("Error : Unable to allocate inputMatrix, exiting...\n");
        return 1;
    }
    for (int i = 0; i < height; ++i) {
        for (int j = 0; j < width; j++) {
            inputMatrix[i * width + j] = j;
        }
    }

    // Desired output size: half the input width, same height.
    const int outputWidth = width / 2;
    const int outputHeight = height;

    // decimate_cuda takes (height, width) for both source and destination, so the
    // destination must be passed as (outputHeight, outputWidth). It returns a freshly
    // malloc'ed result buffer, so no output buffer is pre-allocated here (doing so
    // would leak it when the pointer is overwritten). The function copies the host
    // buffer to the device itself, so no OpenACC data directive is needed.
    float* outputMatrix = decimate_cuda(inputMatrix, height, width, outputHeight, outputWidth);

    for (int i = 0; i < 20; i++) {
        printf("input[%d]=%f, outputMatrix[%d]=%f\n", i, inputMatrix[i], i, outputMatrix[i]);
    }

    // Free allocated memory.
    free(inputMatrix);
    free(outputMatrix);
    return 0;
}