Okay, I have made a MWE which does reproduce the strangely(?) long initialization time, but not the slowness of drawing on 9.2. So I started tinkering with my full code’s kernel, which calls a function that draws 12 random numbers (per thread); changing the number of draws per thread proportionately changes the kernel time. Thus, I suppose some sort of caching is no longer happening on 9.2? I.e., on 9.2 the global array is being read from (and written to, as the curandState is updated each time a number is drawn) each of the 12 times.
I was able to eliminate the performance loss by passing the curandState itself to this function (rather than a pointer into the global curandState array), and then, at the end, writing the (updated) local curandState back into the global array.
Does this assessment seem reasonable?
In the below MWE I implement both methods (always reading the global array vs. storing the curandState in registers):
// main.cu
// testing curand
#include <stdio.h>
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <curand.h>
#include <curand_kernel.h>
#define N1 128
#define N (N1*N1*N1)
#define randseed 1230123
// wrapper for CUDA API functions to check for errors
#define gpuERR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Error handler behind the gpuERR macro: reports the failing file/line and
// CUDA error string on stderr; when abort is true (default), resets the
// device and exits with the error code.
inline void gpuAssert(cudaError_t code, const char *file, const int line, bool abort=true)
{
    if(code == cudaSuccess) return;            // fast path: nothing to report
    fprintf(stderr,"%s:%d: CUDA API error: %s\n",file,line,cudaGetErrorString(code));
    if(!abort) return;                         // caller opted to continue on error
    cudaDeviceReset();                         // release device state before exiting
    exit(code);
}
// Seed one curandState per thread: each slot gets the common seed with a
// distinct subsequence id (its global index). Bounds-checked so partial
// trailing blocks are safe.
__global__ void curandInit(curandState *state)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if(gid >= N) return;
    curand_init(randseed, gid, 0, &state[gid]);
}
// Baseline variant: each of the 12 draws passes a pointer into the global
// state array, so the generator state is updated through that global pointer
// on every call. Writes the sum of 12 centered uniforms to out[gid].
__global__ void curandDraw(curandState *state, double *out)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if(gid >= N) return;
    double acc = 0.;
    for(int draw = 0; draw < 12; draw++)
        acc += curand_uniform_double(&state[gid]) - .5;
    out[gid] = acc;
}
// Local-copy variant: the state is copied out of the global array once,
// all 12 draws advance the local copy, and the advanced state is written
// back at the end. Same output as curandDraw for identical starting state.
__global__ void curandDraw_fast(curandState *state, double *out)
{
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if(gid >= N) return;
    curandState local = state[gid];      // single read of the global state
    double acc = 0.;
    for(int draw = 0; draw < 12; draw++)
        acc += curand_uniform_double(&local) - .5;
    out[gid] = acc;
    state[gid] = local;                  // single write-back of advanced state
}
// Globals backed by managed memory (allocated in main):
curandState *randState; // one RNG state per element, N total
double *out;            // per-element sum of 12 centered uniform draws
// Benchmark driver: times curand_init over N states, then times 1000
// launches each of the global-pointer draw kernel and the local-copy
// ("fast") draw kernel, reporting per-launch averages via CUDA events.
int main()
{
    // Managed allocations: one RNG state and one output slot per element.
    gpuERR( cudaMallocManaged((void**) &randState, sizeof(curandState) * N ) );
    gpuERR( cudaMallocManaged((void**) &out, sizeof(double) * N ) );
    cudaEvent_t t0, tf;
    float time;
    // Event API calls are checked through gpuERR, matching the rest of the file.
    gpuERR( cudaEventCreate(&t0) );
    gpuERR( cudaEventCreate(&tf) );

    // --- Time initialization of all N generator states ---
    gpuERR( cudaEventRecord(t0,0) );
    const int initThreads = 512;
    // Ceil-division grid sizing: correct even when N is not a multiple of
    // the block size (the kernel's bounds check handles the tail).
    curandInit<<< (N + initThreads - 1) / initThreads, initThreads >>>(randState);
    gpuERR( cudaPeekAtLastError() ); gpuERR( cudaDeviceSynchronize() );
    gpuERR( cudaEventRecord(tf,0) );
    gpuERR( cudaEventSynchronize(tf) );
    gpuERR( cudaEventElapsedTime(&time,t0,tf) );
    printf("Initialization took %f seconds\n",time/1000.);

    // --- Time the baseline draw kernel (state updated via global pointer) ---
    int nloop = 1000;
    const int drawThreads = 32;
    gpuERR( cudaEventRecord(t0,0) );
    for(int i = 0; i < nloop; i++)
    {
        curandDraw<<< (N + drawThreads - 1) / drawThreads, drawThreads >>>(randState, out);
        gpuERR( cudaPeekAtLastError() ); gpuERR( cudaDeviceSynchronize() );
    }
    gpuERR( cudaEventRecord(tf,0) );
    gpuERR( cudaEventSynchronize(tf) );
    gpuERR( cudaEventElapsedTime(&time,t0,tf) );
    printf("Draw kernel took %f ms\n",time/(double)nloop);

    // --- Time the local-copy draw kernel ---
    gpuERR( cudaEventRecord(t0,0) );
    for(int i = 0; i < nloop; i++)
    {
        curandDraw_fast<<< (N + drawThreads - 1) / drawThreads, drawThreads >>>(randState, out);
        gpuERR( cudaPeekAtLastError() ); gpuERR( cudaDeviceSynchronize() );
    }
    gpuERR( cudaEventRecord(tf,0) );
    gpuERR( cudaEventSynchronize(tf) );
    gpuERR( cudaEventElapsedTime(&time,t0,tf) );
    printf("Fast draw kernel took %f ms\n",time/(double)nloop);

    // Cleanup: destroy events and free managed memory before resetting.
    gpuERR( cudaEventDestroy(t0) );
    gpuERR( cudaEventDestroy(tf) );
    gpuERR( cudaFree(randState) );
    gpuERR( cudaFree(out) );
    cudaDeviceReset();
}
Compile (for Pascal) with
nvcc -dc -m64 -lineinfo -arch=sm_60 main.cu -o main.o
nvcc main.o -arch=sm_60 -o rand
On my P100, CUDA v9.2 the result is
Initialization took 4.159226 seconds
Draw kernel took 0.551846 ms
Fast draw kernel took 0.703808 ms
So something (register usage?) in my full code’s kernel must be making the difference, because the “fast” kernel is not in fact fast in this MWE. The results are consistent across CUDA versions.