cuRAND 8.0: Bugs in curandSetGeneratorOffset() with PHILOX4_32_10 and curandGenerateSeeds() with MT19937 ?

Hi,

I’ve noticed a couple of problems with cuRAND when trying CPU/GPU reproducibility, sequence offset and general performance. I’m using cuRAND 8.0.61 on Windows.

  • Using curandSetGeneratorOffset() with PHILOX4_32_10 seems to return back to the beginning of the sequence no matter the offset. MRG32K3A and XORWOW work fine. See the "ERROR: comparison 8000 failed" message when running the following program (source to follow).
  • Using curandGenerateSeeds() with MT19937 CPU takes about 20-30 seconds (?!), surely that's not right?

Is there anything obvious I’m missing here?

Regards,
Tanguy

#include <cuda.h>
#include <curand.h>
#include <cuda_runtime.h>

#include <chrono>
#include <iostream>
#include <stdexcept>
#include <string>
#include <sstream>
#include <vector>

using namespace std::chrono;


// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define CheckCudaError(val)           __checkCudaError ( (val), #val, __FILE__, __LINE__ )
#define CheckCuRandError(val)         __checkCuRandError ( (val), #val, __FILE__, __LINE__ )


class CudaException : public std::runtime_error
{
public:
	CudaException(cudaError_t cudaError, const std::string & msg) :
		runtime_error(msg),
		CudaError(cudaError) { }

	const cudaError_t CudaError;
};


class CuRandException : public std::runtime_error
{
public:
	CuRandException(curandStatus_t cuRandError, const std::string & msg) :
		runtime_error(msg),
		CuRandError(cuRandError) { }

	const curandStatus_t CuRandError;
};


inline void __checkCudaError(cudaError_t result, char const *const func, const char *const file, int const line)
{
	if (result)
	{
		std::ostringstream out;

		out << "CUDA error at "
			<< file << ":" << line
			<< " code=" << static_cast<unsigned int>(result)
			<< " (" << cudaGetErrorString(result) << ") "
			<< "'" << func << "'";

		throw CudaException(result, out.str());
	}
}


inline void __checkCuRandError(curandStatus_t result, char const *const func, const char *const file, int const line)
{
	if (result)
	{
		std::ostringstream out;

		out << "cuRAND error at "
			<< file << ":" << line
			<< " code=" << static_cast<unsigned int>(result)
			<< "'" << func << "'";

		throw CuRandException(result, out.str());
	}
}


class CuRandom
{
public:

	CuRandom(bool cpu, curandRngType_t rngType) :
		cpu_(cpu),
		hBuffer_(BufferSize),
		pos_(BufferSize)
	{
		CheckCudaError(cudaMalloc(&dBuffer_, BufferSize * sizeof(float)));
		CheckCuRandError(cpu ? curandCreateGeneratorHost(&generator_, rngType) : curandCreateGenerator(&generator_, rngType));

		// Precompute the seeds and time it separately.
		{
			auto begin = high_resolution_clock::now();

			CheckCuRandError(curandGenerateSeeds(generator_));

			auto end = high_resolution_clock::now();
			auto elapsed = duration_cast<nanoseconds>(end - begin).count();
			std::cout << "Generated " << (cpu_ ? "CPU" : "GPU") << " seeds in " << elapsed / (1000.0*1000.0) << "ms" << std::endl;
		}
	}

	~CuRandom()
	{
		CheckCuRandError(curandDestroyGenerator(generator_));
		CheckCudaError(cudaFree(dBuffer_));
	}

	float NextFloat()
	{
		if (pos_ >= BufferSize)
		{
			RefillBuffer();
			pos_ = 0;
		}

		return hBuffer_[pos_++];
	}

	void SetOffset(unsigned long long offset)
	{
		auto begin = high_resolution_clock::now();

		CheckCuRandError(curandSetGeneratorOffset(generator_, offset));
		pos_ = BufferSize;

		auto end = high_resolution_clock::now();
		auto elapsed = duration_cast<nanoseconds>(end - begin).count();
		std::cout << "Set " << (cpu_ ? "CPU" : "GPU") << "offset " << offset << " in " << elapsed / (1000.0*1000.0) << "ms" << std::endl;
	}

private:

	void RefillBuffer()
	{
		auto begin = high_resolution_clock::now();

		if (cpu_)
		{
			CheckCuRandError(curandGenerateUniform(generator_, hBuffer_.data(), BufferSize));
		}
		else
		{
			CheckCuRandError(curandGenerateUniform(generator_, dBuffer_, BufferSize));
			CheckCudaError(cudaMemcpy(hBuffer_.data(), dBuffer_, BufferSize * sizeof(float), cudaMemcpyDeviceToHost));
		}

		auto end = high_resolution_clock::now();
		auto elapsed = duration_cast<nanoseconds>(end - begin).count();
		std::cout << "Refilled " << (cpu_ ? "CPU" : "GPU") << "buffer in " << elapsed / (1000.0*1000.0) << "ms" << std::endl;
	}

	const size_t BufferSize = 8 * 1024 * 1024;

	const bool cpu_;
	curandGenerator_t generator_;
	std::vector<float> hBuffer_;
	float* dBuffer_;
	size_t pos_;
};


void RunComparisons();
void RunComparison(curandRngType_t rngType, const char* name, size_t numComparisons);


int main(int argc, char *argv[])
{
	try
	{
		int cuRandVersion;
		CheckCuRandError(curandGetVersion(&cuRandVersion));
		std::cout << "cuRAND version: " << cuRandVersion << std::endl;
		std::cout << std::endl;

		RunComparisons();
	}

	catch (const std::exception& exception)
	{
		std::cout << "ERROR: " << exception.what() << std::endl;
	}
}


void RunComparisons()
{
	const size_t numComparisons = 16 * 1000 * 1000;

	RunComparison(CURAND_RNG_PSEUDO_MT19937, "MT19937", numComparisons);
	RunComparison(CURAND_RNG_PSEUDO_MTGP32, "MTGP32", numComparisons);
	RunComparison(CURAND_RNG_PSEUDO_MRG32K3A, "MRG32K3A", numComparisons);
	RunComparison(CURAND_RNG_PSEUDO_XORWOW, "XORWOW", numComparisons);
	RunComparison(CURAND_RNG_PSEUDO_PHILOX4_32_10, "PHILOX4_32_10", numComparisons);
}


void RunComparison(curandRngType_t rngType, const char* const name, size_t numComparisons)
{
	std::cout << "Running " << numComparisons << " comparisons for '" << name << "'..." << std::endl;

	auto begin = high_resolution_clock::now();

	CuRandom refRand(true, rngType);
	CuRandom cpuRand(true, rngType);
	CuRandom gpuRand(false, rngType);

	for (int i = 0; i != numComparisons; ++i)
	{
		// Check offset setting on RNG that supports it.
		if (i == 8000 && rngType != CURAND_RNG_PSEUDO_MT19937 && rngType != CURAND_RNG_PSEUDO_MTGP32)
		{
			const int offset = i + 3;

			// Move reference RNG by repeatedly calling NextFloat().
			for (int j = i; j != offset; ++j)
			{
				refRand.NextFloat();
			}

			cpuRand.SetOffset(offset);
			gpuRand.SetOffset(offset);
		}

		float r = refRand.NextFloat();
		float c = cpuRand.NextFloat();
		float g = gpuRand.NextFloat();

		if (i == 0)
		{
			std::cout << "First: " << r << std::endl;
		}

		if (r != c || c != g)
		{
			std::cout
				<< "ERROR: comparison " << i << " failed for '" << name
				<< "' (" << r << " != " << c << " || " << c << " != " << g << ")"
				<< std::endl;

			break;
		}
	}

	auto end = high_resolution_clock::now();
	auto elapsed = duration_cast<nanoseconds>(end - begin).count();

	std::cout << "Ran comparison in " << elapsed / (1000.0*1000.0) << "ms" << std::endl;
	std::cout << std::endl;
}

I’ve reported internal NVIDIA bug #1922632 for the PHILOX generator offset issue.

If there are meaningful developments, I will advise. Until then, requests for additional information may or may not be responded to. I’m not 100% sure this is a bug in the code (could be a documentation bug) but until there is response by the development teams, I won’t be able to offer any additional information. I have no information on workarounds at this time, except to either not use that function or use that function with a different generator type.