MersenneTwister example doesn't run on big endian machines

Hi!

I’m just trying to run a few of the OpenCL samples on my PPC machine. Unfortunately, the MersenneTwister sample segfaults. The reason is that it reads a memory dump generated for a little-endian machine, which is interpreted incorrectly on my machine. Some of these values are used as bounds in a for loop writing into an array, and eventually into unallocated memory… As I don’t understand the code well enough, I can’t provide a patch to fix the problem. However, the following patch makes the sample run without segfaulting on my machine, but it doesn’t pass the test yet. Can anyone help me out?

Cheers,

Ingo

diff -pur NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister.cpp IBM_patched-NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister.cpp

--- NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister.cpp   2010-03-31 16:20:20.000000000 +0200															  

+++ IBM_patched-NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister.cpp	   2010-03-31 19:14:12.000000000 +0200											  

@@ -33,6 +33,14 @@ extern "C" void RandomRef(float *h_Rand,																															  

 extern "C" void BoxMullerRef(float *h_Rand, int nPerRng);																															   

 #endif																																												  

																																														 

+inline void endian_swap(unsigned int& x)																																				

+{																																													   

+	x = (x>>24) |																																									   

+		((x<<8) & 0x00FF0000) |																																						 

+		((x>>8) & 0x0000FF00) |																																						 

+		(x<<24);																																										

+}																																													   

+																																														

 ///////////////////////////////////////////////////////////////////////////////																										 

 //Load twister configurations																																						   

 ///////////////////////////////////////////////////////////////////////////////																										 

@@ -58,8 +66,15 @@ void loadMTGPU(const char *fname,																																	 

				oclCheckError(0, 1);																																					 

		 }																																											   

																																														 

-	for (unsigned int i = 0; i < size; i++)																																			 

+	for (unsigned int i = 0; i < size; i++) {																																		   

		 fread(&h_MT[i], sizeof(mt_struct_stripped), 1, fd);																															 

+																																														

+		for( int j = 0; j < 4; j++) {																																				   

+			unsigned int x = *((int*)(h_MT+i) + j);																																	 

+			endian_swap(x);

+			*((int*)(h_MT+i) + j) = x;

+		}

+	}

	 fclose(fd);

	 for(unsigned int i = 0; i < size; i++)

@@ -73,7 +88,7 @@ int main(int argc, const char **argv)

 {

	 cl_context cxGPUContext;						// OpenCL context

	 cl_command_queue cqCommandQueue[MAX_GPU_COUNT]; // OpenCL command que

-	cl_platform_id cpPlatform;					  // OpenCL platform

+	cl_platform_id cpPlatform = NULL;					  // OpenCL platform

	 cl_uint nDevice;								// OpenCL device count

	 cl_device_id* cdDevices;						// OpenCL device list

	 cl_program cpProgram;						   // OpenCL program

@@ -94,12 +109,12 @@ int main(int argc, const char **argv)

	 shrLog("Get platforms...\n");

	 ciErr1 = oclGetPlatformID(&cpPlatform);

	 oclCheckError(ciErr1, CL_SUCCESS);

-

+

	 shrLog("Get devices...\n");

-	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);

+	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &nDevice);

	 oclCheckError(ciErr1, CL_SUCCESS);

	 cdDevices = (cl_device_id *)malloc(nDevice * sizeof(cl_device_id) );

-	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, nDevice, cdDevices, NULL);

+	ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, nDevice, cdDevices, NULL);

	 oclCheckError(ciErr1, CL_SUCCESS);

	 shrLog("Create context...\n");

diff -pur NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister_gold.cpp IBM_patched-NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister_gold.cpp

--- NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister_gold.cpp	  2010-03-31 16:20:20.000000000 +0200

+++ IBM_patched-NVIDIA_GPU_Computing_SDK/OpenCL/src/oclMersenneTwister/src/oclMersenneTwister_gold.cpp  2010-03-31 19:12:32.000000000 +0200

@@ -20,6 +20,14 @@

 static mt_struct MT[MT_RNG_COUNT];

 static uint32_t state[MT_NN];

+inline void endian_swap(unsigned int& x)

+{

+	x = (x>>24) |

+		((x<<8) & 0x00FF0000) |

+		((x>>8) & 0x0000FF00) |

+		(x<<24);

+}

+

 extern "C" void initMTRef(const char *fname){

	 FILE* fd = 0;

@@ -47,6 +55,11 @@ extern "C" void initMTRef(const char *fn

		 {

				oclCheckError(0, 1);

		 }

+		for( int j = 0; j < 16; j++) {

+			unsigned int x = *((int*)(MT+i) + j);

+			endian_swap(x);

+			*((int*)(MT+i) + j) = x;

+		}

	 }

	 fclose(fd);

Hi!

I’ve got the program running now. The IBM OpenCL implementation I used on my PPC machine requires the reqd_work_group_size attribute. With that set, and the patch below, the sample works. I also found Victor Podlozhnyuk’s paper about MersenneTwister on CUDA, which helps a lot in understanding the code.

Cheers,
Ingo