Hello all,
So I’ve come across a rather curious problem. When I call a device function that draws some random numbers and stores them in every second entry of a a shared array, then the array is not always displayed correctly when I print it, although I call __syncthreads() at the end of the function and (!) after the function was called. Only a third __syncthreads immediately after resolves the problem. I tried also to just stall the kernel with a recursive function call but that didn’t help, the program still needs three successive __syncthreads() calls to work properly. I think the problem lies with branching in the function call but I’m not really sure. It might also be that the RNG is only called from half of the warps, though that would render the point of the __syncthreads function moot, I guess.
The code is as follows:
#include <curand.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <sm_30_intrinsics.h>
#include <device_functions.h>
#include <curand_kernel.h>
/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
/* include MTGP pre-computed parameter sets */
#include <curand_mtgp32dc_p_11213.h>
#include <stdio.h>
#include "curandMTGP32SpecificDistributions.h"
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
exit(EXIT_FAILURE);}} while(0)
#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
exit(EXIT_FAILURE);}} while(0)
#define CASE_A
#define CASE_B
#define CASE_C
#define CASE_D
__device__ void testFunction(volatile float* testFloat, curandStateMtgp32 *state)
{
testFloat[threadIdx.x]=0.;
if (threadIdx.x<128)
{
int numActiveThreads = 128;
int address = threadIdx.x*2;
testFloat[address] = curand_uniform_mtgp32_specific(state, (unsigned char)threadIdx.x, (unsigned char)numActiveThreads);
}
#ifdef CASE_A
__syncthreads();
#endif
}
#ifdef CASE_D
__device__ int fibonacci(int numberOfIterations)
{
//this function takes a lot of time, as it should.
if (numberOfIterations==1)
return 1;
if(numberOfIterations==0)
return 0;
return fibonacci(numberOfIterations-1) + fibonacci(numberOfIterations-2);
}
#endif
__device__ void printFloats(volatile float * testFloat)
{
printf("testFloat[%d] = %f\n", threadIdx.x, testFloat[threadIdx.x]);
}
__global__ void testKernel(curandStateMtgp32 *state)
{
__shared__ float testFloat[256];
curandStateMtgp32 *myState = &state[0];
__syncthreads();
testFunction(testFloat, myState);
#ifdef CASE_B
__syncthreads();
#endif
#ifdef CASE_C
__syncthreads();
#endif
#ifdef CASE_D
int waitNumber = fibonacci(35);
if(threadIdx.x%32==0)
printf("The %dth Fibonacci number is %d\n", 35, waitNumber);
#endif
printFloats(testFloat);
}
int main()
{
int numberofMTGP32States = 200;
curandStateMtgp32 *devMTGPStates;
mtgp32_kernel_params *devKernelParams;
int seed = 42;
CUDA_CALL(cudaMalloc((void **)&devMTGPStates, numberofMTGP32States * sizeof(curandStateMtgp32)));
/* Allocate space for MTGP kernel parameters */
CUDA_CALL(cudaMalloc((void**)&devKernelParams, sizeof(mtgp32_kernel_params)));
/* Reformat from predefined parameter sets to kernel format, */
/* and copy kernel parameters to device memory */
CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));
/* Initialize one state per thread block */
CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, numberofMTGP32States, seed));
testKernel<<<1,256>>>(devMTGPStates);
cudaDeviceSynchronize();
cudaFree(devMTGPStates);
}
and curandMTGP32SpecificDistributions.h holds just the curandMTGP32Specific RNG plugged in the normal distribution of cuda:
#ifndef CURAND_MTGP32_SPECIFIC_DISTRIBUTIONS_H
#define CURAND_MTGP32_SPECIFIC_DISTRIBUTIONS_H
#include <curand.h>
#include <curand_kernel.h>
#include <curand_globals.h>
#include <curand_uniform.h>
#include <curand_mtgp32_kernel.h>
__device__ __forceinline__ float curand_uniform_mtgp32_specific(curandStateMtgp32 *state, unsigned char index, unsigned char n)
{
unsigned int randomNumber =curand_mtgp32_specific(state, index, n);
return _curand_uniform(randomNumber);
}
#endif
I would appreciate some help or explanations for this behaviour since it would be nice to know when __syncthreads() works in such a case or what I did wrong. From the Cuda C documentation I gather that __syncthreads() shouldn’t be called in a branching segment of code but I thought it is still possible to use it outside of it.
I defined three macro variables, CASE_A, CASE_B, CASE_C and CASE_D that make it easy to play a bit around with the code.
In any case thanks in advance to anyone who would help me since this has me really baffled.
Best,
David