Hello,
I am trying to re-write Nvidia’s CUDA Mersenne Twister shared memory version in OpenCL. I basically needed it with generating on the fly random numbers rather than pre-generated ones.
this is essentially the CUDA version :
#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */
#ifdef NVG80
#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */
#else
#define mod(x, y) ((x) % (y))
#endif
#define N 624
#define M 397
#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. /
#define ARRAY_SEED 19650218 / Seed for initial setup before incorp array seed /
#define MATRIX_A 0x9908b0df / Constant vector a /
#define UPPER_MASK 0x80000000 / Most significant w-r bits /
#define LOWER_MASK 0x7fffffff / Least significant r bits */
#define TEMPER1 0x9d2c5680
#define TEMPER2 0xefc60000
shared int mtNexts; /* Start of next block of seeds */
shared uint s_seeds[N + 1];
/* Init by single seed - single threaded as only used once */
device void mt19937si(uint seed)
{
int i;
if (threadIdx.x == 0)
{
mtNexts = 0;
s_seeds[0] = seed;
for (i = 1; i < N; i++)
{
seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);
s_seeds[i] = seed;
}
}
__syncthreads(); /* Ensure mtNexts set & needed for mt19937w() */
return;
}
device uint mt19937s(void)
{
int kk;
uint y;
const int tid = threadIdx.x;
kk = mod(mtNexts + tid, N);
__syncthreads(); /* Finished with mtNexts & s_seed[] ready from last run */
if (tid == blockDim.x - 1)
{
mtNexts = kk + 1; /* Will get modded on next call */
}
y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);
y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];
//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed
__syncthreads(); /* All done before we update */
s_seeds[kk] = y;
if (kk == 0) /* Copy up for next round */
{
s_seeds[N] = y;
}
y ^= (y >> 11); /* Tempering */
y ^= (y << 7) & TEMPER1;
y ^= (y << 15) & TEMPER2;
y ^= (y >> 18);
return y;
}
For the OpenCL bit, I merged the two functions as I did not know how to maintain state between those functions and could not declare mtNexts and s_seeds as shared out of any function. I came up with the following in OpenCL :
#define NVG80 /* For Nvidia G80 achitecture where mod is VERY slow */
#ifdef NVG80
#define mod(x, y) ((x) < (y) ? (x) : (x) - (y)) /* Short mod - known input range */
#else
#define mod(x, y) ((x) % (y))
#endif
#define N 624
#define M 397
#define INIT_MULT 1812433253 /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. /
#define ARRAY_SEED 19650218 / Seed for initial setup before incorp array seed /
#define MATRIX_A 0x9908b0df / Constant vector a /
#define UPPER_MASK 0x80000000 / Most significant w-r bits /
#define LOWER_MASK 0x7fffffff / Least significant r bits */
#define TEMPER1 0x9d2c5680
#define TEMPER2 0xefc60000
__constant int mag01[2] = {0,0x9908b0df}; // you can’t use MATRIX_A as you have __constant
unsigned int my_mt19937s(unsigned int seed) {
int mtNexts; /* Start of next block of seeds */
unsigned int s_seeds[N + 1];
int i;
if (get_local_id(0) == 0)
{
mtNexts = 0;
s_seeds[0] = seed;
for (i = 1; i < N; i++)
{
seed = (INIT_MULT * (seed ^ (seed >> 30)) + i);
s_seeds[i] = seed;
}
}
barrier(CLK_LOCAL_MEM_FENCE); /* Ensure mtNexts set & needed for mt19937w() */
int kk;
unsigned int y;
const int tid = get_local_id(0);
kk = mod(mtNexts + tid, N);
barrier(CLK_LOCAL_MEM_FENCE); /* Finished with mtNexts & s_seed[] ready from last run */
if (tid == get_local_size(0) - 1)
{
mtNexts = kk + 1; /* Will get modded on next call */
}
y = (s_seeds[kk] & UPPER_MASK) | (s_seeds[kk + 1] & LOWER_MASK);
y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ mag01[y & 1];
//y = s_seeds[kk < N - M ? kk + M : kk + (M - N)] ^ (y >> 1) ^ (y & 1 ? MATRIX_A : 0); // Same speed
barrier(CLK_LOCAL_MEM_FENCE); /* All done before we update */
s_seeds[kk] = y;
if (kk == 0) /* Copy up for next round */
{
s_seeds[N] = y;
}
y ^= (y >> 11); /* Tempering */
y ^= (y << 7) & TEMPER1;
y ^= (y << 15) & TEMPER2;
y ^= (y >> 18);
return y;
}
In my kernel function I call my_mt19937s(get_global_id(0)) 1000 times but I get a correct value only for the first time, and for the rest I get 0, any ideas what is going wrong? If you require any additional info please let me know.
Thanks,
- vihan