Hello,
I am an intern using CUDA to parallelize a fluid dynamics simulation that my professor wrote. His code is using M-Twister to generate an array of random numbers because this method is recommended in the literature.
I came up with another way that I think would be faster. If I need an array of N random numbers, I invoke a kernel with N threads. For each thread, I send the thread ID to a hash-function. This function takes a single integer as a parameter and returns an integer that is uniformly distributed across a large range. Sha1 and MD5 can be used for this.
Attached is a program that uses this method to generate 1,000 random numbers between 0 and 9 and prints the frequency of each generated number. To my eye, it looks ok. The hash function was not written by me, I found it in http://burtleburtle.net/bob/hash/integer.html
My professor and I are not sure if we should use this, so we would like to ask for your opinions and criticism.
Thanks a lot!! :)
#include <stdio.h>
// hash algorithm from http://burtleburtle.net/bob/hash/integer.html
__device__ int hash(int a) {
a = (a ^ 61) ^ (a >> 16);
a = a + (a << 3);
a = a ^ (a >> 4);
a = a * 0x27d4eb2d;
a = a ^ (a >> 15);
return a;
}
__global__ void randomGen(int *arr, int length, int randNumRange)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
if (id < length) {
arr[id] = hash(id) % randNumRange;
}
}
/////////////////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
// setup -- generate 1000 random numbers, each from 0 to 10
int dimA = 1000;
int randNumRange = 10;
int numThreadsPerBlock = 512;
int numBlocks = dimA / numThreadsPerBlock + (dimA % numThreadsPerBlock == 0 ? 0 : 1);
// memory allocation
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
int *h = (int *) malloc(memSize);
int *d;
cudaMalloc( (void **) &d, memSize );
// kernel call
randomGen<<< numBlocks, numThreadsPerBlock >>>( d, dimA, randNumRange );
// memory copy from dev to host
cudaMemcpy( h, d, memSize, cudaMemcpyDeviceToHost );
// initialize, count and print the frequency of occurance of each number within the range.
int *freq = (int *) calloc(randNumRange, sizeof(int));
for (int i = 0; i < randNumRange; i++) {
freq[i] = 0;
}
for (int i = 0; i < dimA; i++)
{
freq[h[i]]++;
}
for (int i = 0; i < randNumRange; i++) {
printf("%d: %d\n", i, freq[i]);
}
// free device memory
cudaFree(d);
// free host memory
free(h);
free(freq);
}
random_gen_cuda.cu (1.57 KB)