CUDA curand memory error without a "dummy kernel"

Hello!
My goal is to generate uniform random numbers on the device for a raytracer, therefore I want to use 2 curandStateScrambledSobol64 states per thread. These are generated in advance in a setup_kernel and then given to the main kernel. During development I also used a different setup kernel for curandState states, which are not used any more.

Mainly I followed the template from https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-overview .

Now if I remove the unused kernel call from the host sequence, then I get a memory violation in the device curand_uniform_double(&globalstate[id]) call. When I leave the unused kernel in, the program runs fine and the output is valid. Also it is important, that I leave the curand_init() call with the thread id in this “dummy” kernel, otherwise it will memory assert too.
As a second workaround I can move the curand_init(0,id,0,&x) to the actual setup kernel (doing the curand_init twice) but the performance drop is huge with about +7s to the total runtime.

Has anyone experienced a similar behavior or an explanation for this?

The exact error message:
“an illegal memory access was encountered”

Memchecker:
“OutOfRangeAddress” after a curand_uniform_double and function return

Here is the code (partially):
//--------------------------------------------------------------------------------------------------

#define N_DIRECTION_VECTORS 20000
/* Number of 64-bit vectors per dimension */
#define VECTOR_SIZE 64

//##################################################################################################
__global__ void dummy_kernel()
{
	unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
	curandState x;
	curand_init(0, id, 0, &x);
}

//##################################################################################################
__global__ void setup_kernel_s(unsigned long long * __restrict sobolDirectionVectors,
	unsigned long long * __restrict sobolScrambleConstants,
	curandStateScrambledSobol64 * __restrict state, size_t n_jobs, size_t seed)
{
	unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id >= n_jobs)
		return;

	size_t vec_id = (id + seed) % N_DIRECTION_VECTORS;
	size_t offset = id / N_DIRECTION_VECTORS;

	curand_init(sobolDirectionVectors + 64 * vec_id,
		sobolScrambleConstants[vec_id],
		offset,// * 1234,
		&state[id]);
}

//##################################################################################################
__global__ void d_raytrace(const size_t resx, 
	const size_t resy, const dPrim* prims, const size_t n_prims, 
	const size_t n_jobs, const size_t samples, dVec* __restrict pixels, curandStateScrambledSobol64* __restrict globalState_a, curandStateScrambledSobol64* __restrict globalState_b)
{
	unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id >= n_jobs)
		return;

        /* non curand related stuff*/
        //...

	double rand1 = curand_uniform_double(&globalState_a[id]);
	double rand2 = curand_uniform_double(&globalState_b[id]);

        /* recursive function call (max depth = 12)  
           it again generates random numbers and after 
           another function call the memcheck shows the error during its return
        */
        data = f(globalState_a,globalState_b);
        //...

	pixels[i] = data;
	return;
}	

//#############################HOST#################################################################
void Raytrace::computeRaytrace(const Camera& cam, const vector<UniformPrim>& prims, const size_t supersample, const size_t samples, vector<Vec>& pixels)
{
	const unsigned int blocksize = 64;
	size_t gridsize;
	size_t n_pixels = cam.resx * cam.resy;
	size_t n_jobs = n_pixels;
	gridsize = (n_jobs - 1) / blocksize + 1;
	size_t shared_directions = n_jobs / N_DIRECTION_VECTORS + 1;
	size_t total_launch_threads = gridsize * blocksize;

	dVec* d_pixels;
	//dPrim* d_prims;
	size_t n_prims = prims.size();

	curandStateScrambledSobol64 *d_Sobol64States_a;
	curandStateScrambledSobol64 *d_Sobol64States_b;
	curandDirectionVectors64_t *hostVectors64;
	unsigned long long int * hostScrambleConstants64;
	unsigned long long int * d_DirectionVectors64;
	unsigned long long int * d_ScrambleConstants64;

	cudaEvent_t finished;

	double calc_approx = 2.2; //seconds for a sample
	double total = calc_approx * samples;

	printf("\n\nI'm the GPU now\n\n");

	//cannot get rid of this dummy kernel without error???
	CUDACHECKERROR();
	dummy_kernel << < 1, blocksize >> > ();
	CUDACHECKERROR();
	//CSC(cudaDeviceSynchronize());

	//loads no more than N_DIRECTION_VECTORS into host memory
	CURAND_CALL(curandGetDirectionVectors64(&hostVectors64,CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6));
	CURAND_CALL(curandGetScrambleConstants64(&hostScrambleConstants64));

	//full amount of sobel states
	CSC(cudaMalloc((void **)&d_Sobol64States_a, total_launch_threads * sizeof(curandStateScrambledSobol64)));
	CSC(cudaMalloc((void **)&d_Sobol64States_b, total_launch_threads * sizeof(curandStateScrambledSobol64)));

	//limited amount of sobel vectors
	CSC(cudaMalloc((void **)&(d_DirectionVectors64),N_DIRECTION_VECTORS  * VECTOR_SIZE * sizeof(long long int)));
	CSC(cudaMalloc((void **)&(d_ScrambleConstants64), N_DIRECTION_VECTORS * sizeof(long long int)));
	CSC(cudaMemcpy(d_DirectionVectors64, hostVectors64,N_DIRECTION_VECTORS * VECTOR_SIZE * sizeof(long long int),cudaMemcpyHostToDevice));
	CSC(cudaMemcpy(d_ScrambleConstants64, hostScrambleConstants64,	N_DIRECTION_VECTORS * sizeof(long long int),cudaMemcpyHostToDevice));

	cout << "create rand generator states" << endl;
	//Initialize the states 
	//two rand states for each pixel, handle x,y seperate for better distribution
	CUDACHECKERROR();
	setup_kernel_s << <gridsize, blocksize >> > (d_DirectionVectors64, d_ScrambleConstants64, d_Sobol64States_a, total_launch_threads, unsigned(time(NULL)));
	CUDACHECKERROR();
	setup_kernel_s << <gridsize, blocksize >> > (d_DirectionVectors64, d_ScrambleConstants64, d_Sobol64States_b, total_launch_threads, unsigned(1 + time(NULL)));
	CUDACHECKERROR();

	CSC(cudaFree(d_DirectionVectors64));
	d_DirectionVectors64 = nullptr;
	CSC(cudaFree(d_ScrambleConstants64));
	d_ScrambleConstants64 = nullptr;

	cout << "begin raytesting" << endl;
	CUDACHECKERROR();

        //in this kernel is the error when no dummy kernel has run
	d_raytrace << <gridsize, blocksize >> > (cam.resx, cam.resy, 0, n_prims, n_jobs, samples, d_pixels, d_Sobol64States_a, d_Sobol64States_b);

	CUDACHECKERROR();
        ...
}

//--------------------------------------------------------------------------------------------------

Now if I remove the unused kernel call from the host sequence, then I get a memory violation in the device curand_uniform_double(&globalstate[id]) call.

What kernel was removed from the host sequence? Maybe I’m missing something.

Also, is there a reason you’re freeing device memory before the kernel? Those would usually go after you finishing using the GPU unless you’re limited on memory.

It would be much easier to debug if you would provide a working example that can be compiled.

Thank you for the quick response! Sorry for beeing unprecise, by unused kernel call I mean the “dummy_kernel << < 1, blocksize >> > ();” in Line 93 in the patch. I comment it out like

CUDACHECKERROR();
//dummy_kernel << < 1, blocksize >> > ();
CUDACHECKERROR();

I just try be efficient here and free up unused space since the sobol constants are only used in the setup kernel.

You are right, I will follow up with an example as soon as I have it.