#define NUM_ELEMENTS 10 class nppClass { public: void CalcMeanSTD(); void IncArrayElemnts(); void UploadData(); nppClass(); ~nppClass(); private: float* deviceMean; float* deviceStd; Npp8u* m_devRequiredBufferForMeanStd; cudaStream_t m_cudaStream; std::vector hostVec = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f }; float* deviceArray; }; nppClass::nppClass() { cudaStreamCreateWithFlags(&m_cudaStream, cudaStreamNonBlocking); nppSetStream(m_cudaStream); int deviceScratchBufferSize; //Get size of temporal buffer required for NPP API NPP(nppsMeanStdDevGetBufferSize_32f(NUM_ELEMENTS, &deviceScratchBufferSize)); //if (bufferStatus != NPP_SUCCESS) //std::cout << "Problem.." << std::endl; //Allocate temporal buffer on device m_devRequiredBufferForMeanStd = nppsMalloc_8u(deviceScratchBufferSize); //Allocate memory for mean result buffer cudaMalloc(reinterpret_cast(&deviceMean), NUM_ELEMENTS * sizeof(float)); //Allocate memory for STD result buffer cudaMalloc(reinterpret_cast(&deviceStd), NUM_ELEMENTS * sizeof(float)); //Allcoate memory for device array cudaMalloc(reinterpret_cast(&deviceArray), NUM_ELEMENTS * sizeof(float)); } nppClass::~nppClass() { //nppSetStream(0); cudaFree(deviceMean); cudaFree(deviceStd); cudaFree(deviceArray); cudaFree(m_devRequiredBufferForMeanStd); cudaStreamDestroy(m_cudaStream); } void nppClass::CalcMeanSTD() { Npp32f* pMean = reinterpret_cast(deviceMean); int deviceScratchBufferSize; Npp32f* destPtr = reinterpret_cast(deviceArray); NPP(nppsMeanStdDev_32f( destPtr, NUM_ELEMENTS, pMean, deviceStd, m_devRequiredBufferForMeanStd)); cudaStreamSynchronize(m_cudaStream); std::vector hostMean(NUM_ELEMENTS); std::vector hostStd(NUM_ELEMENTS); //Download results to host cudaMemcpyAsync( hostMean.data(), deviceMean, NUM_ELEMENTS * sizeof(float), cudaMemcpyDeviceToHost, m_cudaStream); cudaMemcpyAsync( hostStd.data(), deviceStd, NUM_ELEMENTS * sizeof(float), cudaMemcpyDeviceToHost, m_cudaStream); cudaStreamSynchronize(m_cudaStream); std::cout << "Mean = " << hostMean[0] << std::endl; std::cout << "STD = " << hostStd[0] << std::endl; } void nppClass::IncArrayElemnts() { for (size_t i = 0; i < hostVec.size(); i++) { hostVec[i] += 1.0f; } } void nppClass::UploadData() { //Upload host Array Data to Device cudaMemcpyAsync( deviceArray, hostVec.data(), NUM_ELEMENTS * sizeof(float), cudaMemcpyHostToDevice, m_cudaStream); cudaStreamSynchronize(m_cudaStream); } #define MY_NPP_DEBUG #ifdef MY_NPP_DEBUG #define NPP_CHECK_ERROR(stmt, fname, line)\ {\ NppStatus nppStatus;\ nppStatus = (stmt);\ if (nppStatus != NppStatus::NPP_SUCCESS)\ {\ std::cout << "File: " << fname << std::endl\ << "Line: " << line << std::endl\ << "statement: " << #stmt << std::endl\ << "NPP error - " << nppStatus << std::endl;\ \ getchar();\ }\ else\ {\ std::cout << "File: " << fname << std::endl\ << "Line: " << line << std::endl\ << "statement: " << #stmt << std::endl\ << "NPP error - " << nppStatus << std::endl;\ std::cout << "NPP No error!!!" << std::endl;\ }\ }; #define NPP(stmt) do {\ stmt;\ NPP_CHECK_ERROR(stmt, __FILE__, __LINE__);\ }while(0) #else #define NPP(stmt) stmt #endif /*MY_NPP_DEBUG*/ void main() { std::unique_ptr nppClass1Ptr = std::make_unique(); nppClass1Ptr->UploadData(); nppClass1Ptr->CalcMeanSTD(); nppClass1Ptr = nullptr; std::unique_ptr nppClass2Ptr = std::make_unique(); nppClass2Ptr->IncArrayElemnts(); nppClass2Ptr->UploadData(); nppClass2Ptr->CalcMeanSTD(); }