Using nppiMean_StdDev_8u_C1R after nppSetStream returns NPP_RANGE_ERROR

Hi,

I’m trying to execute an npp function in multiple streams in parallel.
However, in the following example code, nppiMean_StdDev_8u_C1R returns NPP_RANGE_ERROR.
This example code creates two threads, and each thread tries to run nppiMean_StdDev_8u_C1R.
Each thread creates a stream and uses nppSetStream to set the cuda stream which the npp function should run on.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"
#include "nppi.h"

#include <omp.h>
#include <opencv2/opencv.hpp>
int main()
{
    #pragma omp parallel num_threads(2)
    {
        // Source image : 512x512 1channel 8bit, all pixels set to 5
        GpuMat src(512, 512, CV_8UC1);
        src.setTo(Scalar(5));

        // Destination buffer where mean and stddev value is stored
        GpuMat dst(1, 2, CV_64FC1);

        // Create scratch buffer
        int bufSize;
        NppiSize sz;
        sz.width = src.cols;
        sz.height = src.rows;
        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
        GpuMat buf(1, bufSize, CV_8UC1);
        
        // Create stream
        cudaStream_t stream;
        cudaStreamCreate(&stream);

        // Set npp to use this stream
        nppSetStream(stream);

        nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dst.ptr<Npp64f>(), dst.ptr<Npp64f>() + 1) );

        // Wait until npp call finish
        cudaStreamSynchronize(stream);

        // Destroy stream
        cudaStreamDestroy(stream);

        // Print output (expects mean = 5, stddev = 0)
        Mat h_dst;
        dst.download(h_dst);
        std::string out = "thread" + std::to_string(omp_get_thread_num()) + " : (mean)" + std::to_string(h_dst.at<Npp64f>(0, 0)) + " (stddev)" + std::to_string(h_dst.at<Npp64f>(0, 1)) + "\n";
        std::cout << out;
    }
}

When I comment out line 32 (the nppSetStream(stream) call), the program ends with the correct answer.

So my question is :
Is it not safe to use NPP functions in multiple streams with nppSetStream?

Thank you.

CUDA version : 8.0, 9.1
OS : Windows10
GPU : GTX 1080
Driver version : 388.19

Update:

I’ve found this note on the npp documentation that says:

I tried calling cudaStreamSynchronize like the following code but it also returned NPP_RANGE_ERROR.

cudaStream_t oldStream = nppGetStream();
cudaStreamSynchronize(oldStream);
nppSetStream(newStream);

Calling cudaDeviceSynchronize also didn’t work.

Nice investigation, devnglee,
I made the source code shorter.
Now the sample code which crashes on my laptop doesn’t rely on OpenCV at all.
It’s purely calling CUDA and NPP functions.

Am I doing something wrong ?

I hope someone from nvidia can give me a hand about this point.

int main()
{
    #pragma omp parallel num_threads(2)
    {
        int status = 0;
        // Source image : 512x512 1channel 8bit, all pixels set to 5
        unsigned char *ptrSrc;
        size_t pitch = 512;
        size_t width = pitch;
        size_t height = width;
        cudaSafeCall( cudaMallocPitch((void**)&ptrSrc, &pitch, width, height), status );

        unsigned char *ptrPool = NULL;
        cudaSafeCall( cudaMalloc((void**)&ptrPool, width*height * 200), status );
        cudaSafeCall( cudaMemset2D((void**)&ptrSrc, pitch, 5, width, height), status );

        Npp64f *ptrDst = NULL;
        cudaSafeCall(cudaMalloc((void**)&ptrDst, sizeof(double) * 1 * 2), status);

        // Create scratch buffer
        int bufSize;
        NppiSize sz;
        sz.width = (int)width;
        sz.height = (int)height;
        nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize), status );

        unsigned char *ptrBuffer = NULL;
        cudaSafeCall( cudaMalloc((void**)&ptrBuffer, (size_t)(sizeof(char)*bufSize)), status );
        
        // Create stream
        cudaStream_t stream;
        cudaSafeCall( cudaStreamCreate(&stream), status );

        // Set npp to use this stream
        nppSetStream(stream);

        nppSafeCall( nppiMean_StdDev_8u_C1R(ptrSrc, (int)pitch, sz, ptrBuffer, ptrDst, ptrDst + 1), status );

        // Wait until npp call finish
        cudaSafeCall( cudaStreamSynchronize(stream), status );

        // Destroy stream
        cudaSafeCall( cudaStreamDestroy(stream), status );

        std::cout << "thread" << std::to_string(omp_get_thread_num()) << " status : " << status << std::endl;
    }
    return 0;
}