Hello!
My goal is to generate uniform random numbers on the device for a raytracer, therefore I want to use 2 curandStateScrambledSobol64 states per thread. These are generated in advance in a setup_kernel and then given to the main kernel. During development I also used a different setup kernel for curandState states, which are not used any more.
Mainly I followed the template from https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-overview .
Now if I remove the unused kernel call from the host sequence, then I get a memory violation in the device curand_uniform_double(&globalstate[id]) call. When I leave the unused kernel in, the program runs fine and the output is valid. Also it is important, that I leave the curand_init() call with the thread id in this “dummy” kernel, otherwise it will memory assert too.
As a second workaround I can move the curand_init(0,id,0,&x) to the actual setup kernel (doing the curand_init twice) but the performance drop is huge with about +7s to the total runtime.
Has anyone experienced a similar behavior or an explanation for this?
The exact error message:
“an illegal memory access was encountered”
Memchecker:
“OutOfRangeAddress” after a curand_uniform_double and function return
Here is the code (partially):
//--------------------------------------------------------------------------------------------------
#define N_DIRECTION_VECTORS 20000
/* Number of 64-bit vectors per dimension */
#define VECTOR_SIZE 64
//##################################################################################################
__global__ void dummy_kernel()
{
unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
curandState x;
curand_init(0, id, 0, &x);
}
//##################################################################################################
__global__ void setup_kernel_s(unsigned long long * __restrict sobolDirectionVectors,
unsigned long long * __restrict sobolScrambleConstants,
curandStateScrambledSobol64 * __restrict state, size_t n_jobs, size_t seed)
{
unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= n_jobs)
return;
size_t vec_id = (id + seed) % N_DIRECTION_VECTORS;
size_t offset = id / N_DIRECTION_VECTORS;
curand_init(sobolDirectionVectors + 64 * vec_id,
sobolScrambleConstants[vec_id],
offset,// * 1234,
&state[id]);
}
//##################################################################################################
__global__ void d_raytrace(const size_t resx,
const size_t resy, const dPrim* prims, const size_t n_prims,
const size_t n_jobs, const size_t samples, dVec* __restrict pixels, curandStateScrambledSobol64* __restrict globalState_a, curandStateScrambledSobol64* __restrict globalState_b)
{
unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= n_jobs)
return;
/* non curand related stuff*/
//...
double rand1 = curand_uniform_double(&globalState_a[id]);
double rand2 = curand_uniform_double(&globalState_b[id]);
/* recursive function call (max depth = 12)
it again generates random numbers and after
another function call the memcheck shows the error during its return
*/
data = f(globalState_a,globalState_b);
//...
pixels[i] = data;
return;
}
//#############################HOST#################################################################
void Raytrace::computeRaytrace(const Camera& cam, const vector<UniformPrim>& prims, const size_t supersample, const size_t samples, vector<Vec>& pixels)
{
const unsigned int blocksize = 64;
size_t gridsize;
size_t n_pixels = cam.resx * cam.resy;
size_t n_jobs = n_pixels;
gridsize = (n_jobs - 1) / blocksize + 1;
size_t shared_directions = n_jobs / N_DIRECTION_VECTORS + 1;
size_t total_launch_threads = gridsize * blocksize;
dVec* d_pixels;
//dPrim* d_prims;
size_t n_prims = prims.size();
curandStateScrambledSobol64 *d_Sobol64States_a;
curandStateScrambledSobol64 *d_Sobol64States_b;
curandDirectionVectors64_t *hostVectors64;
unsigned long long int * hostScrambleConstants64;
unsigned long long int * d_DirectionVectors64;
unsigned long long int * d_ScrambleConstants64;
cudaEvent_t finished;
double calc_approx = 2.2; //seconds for a sample
double total = calc_approx * samples;
printf("\n\nI'm the GPU now\n\n");
//cannot get rid of this dummy kernel without error???
CUDACHECKERROR();
dummy_kernel << < 1, blocksize >> > ();
CUDACHECKERROR();
//CSC(cudaDeviceSynchronize());
//loads no more than N_DIRECTION_VECTORS into host memory
CURAND_CALL(curandGetDirectionVectors64(&hostVectors64,CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6));
CURAND_CALL(curandGetScrambleConstants64(&hostScrambleConstants64));
//full amount of sobel states
CSC(cudaMalloc((void **)&d_Sobol64States_a, total_launch_threads * sizeof(curandStateScrambledSobol64)));
CSC(cudaMalloc((void **)&d_Sobol64States_b, total_launch_threads * sizeof(curandStateScrambledSobol64)));
//limited amount of sobel vectors
CSC(cudaMalloc((void **)&(d_DirectionVectors64),N_DIRECTION_VECTORS * VECTOR_SIZE * sizeof(long long int)));
CSC(cudaMalloc((void **)&(d_ScrambleConstants64), N_DIRECTION_VECTORS * sizeof(long long int)));
CSC(cudaMemcpy(d_DirectionVectors64, hostVectors64,N_DIRECTION_VECTORS * VECTOR_SIZE * sizeof(long long int),cudaMemcpyHostToDevice));
CSC(cudaMemcpy(d_ScrambleConstants64, hostScrambleConstants64, N_DIRECTION_VECTORS * sizeof(long long int),cudaMemcpyHostToDevice));
cout << "create rand generator states" << endl;
//Initialize the states
//two rand states for each pixel, handle x,y seperate for better distribution
CUDACHECKERROR();
setup_kernel_s << <gridsize, blocksize >> > (d_DirectionVectors64, d_ScrambleConstants64, d_Sobol64States_a, total_launch_threads, unsigned(time(NULL)));
CUDACHECKERROR();
setup_kernel_s << <gridsize, blocksize >> > (d_DirectionVectors64, d_ScrambleConstants64, d_Sobol64States_b, total_launch_threads, unsigned(1 + time(NULL)));
CUDACHECKERROR();
CSC(cudaFree(d_DirectionVectors64));
d_DirectionVectors64 = nullptr;
CSC(cudaFree(d_ScrambleConstants64));
d_ScrambleConstants64 = nullptr;
cout << "begin raytesting" << endl;
CUDACHECKERROR();
//in this kernel is the error when no dummy kernel has run
d_raytrace << <gridsize, blocksize >> > (cam.resx, cam.resy, 0, n_prims, n_jobs, samples, d_pixels, d_Sobol64States_a, d_Sobol64States_b);
CUDACHECKERROR();
...
}
//--------------------------------------------------------------------------------------------------