Undefined Behaviour of __syncthreads and branching inside a __device__ function

Hello all,

So I’ve come across a rather curious problem. I call a __device__ function that draws random numbers and stores them in every second entry of a shared array, yet the array is not always printed correctly afterwards, even though I call __syncthreads() at the end of the function and (!) again after the function returns. Only a third __syncthreads() immediately after that resolves the problem. I also tried to stall the kernel with a recursive function call instead, but that didn’t help; the program still needs three successive __syncthreads() calls to work properly. I suspect the problem lies with the branching inside the function, but I’m not really sure. It might also be that the RNG is only called from half of the warps, though that would render the point of __syncthreads() moot, I guess.

The code is as follows:

#include <curand.h>
#include <curand_kernel.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <sm_30_intrinsics.h>
#include <device_functions.h>

/* include MTGP host helper functions */
#include <curand_mtgp32_host.h>
/* include MTGP pre-computed parameter sets */
#include <curand_mtgp32dc_p_11213.h>

#include <stdio.h>

#include "curandMTGP32SpecificDistributions.h"

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    exit(EXIT_FAILURE);}} while(0)

#define CURAND_CALL(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    exit(EXIT_FAILURE);}} while(0)

#define CASE_A
#define CASE_B
#define CASE_C
#define CASE_D

__device__ void testFunction(volatile float* testFloat, curandStateMtgp32 *state)
{
	testFloat[threadIdx.x]=0.;

	if (threadIdx.x<128)
	{
		int numActiveThreads = 128;
		int address = threadIdx.x*2;

		testFloat[address] = curand_uniform_mtgp32_specific(state, (unsigned char)threadIdx.x, (unsigned char)numActiveThreads);
	}

	#ifdef CASE_A
		__syncthreads();
	#endif

}

#ifdef CASE_D
	__device__ int fibonacci(int numberOfIterations)
	{
		//this function takes a lot of time, as it should.
		if (numberOfIterations==1)
			return 1;

		if(numberOfIterations==0)
			return 0;
		return fibonacci(numberOfIterations-1) + fibonacci(numberOfIterations-2);
	}
#endif

__device__ void printFloats(volatile float * testFloat)
{
	printf("testFloat[%d] = %f\n", threadIdx.x, testFloat[threadIdx.x]);
}

__global__ void testKernel(curandStateMtgp32 *state)
{
	__shared__ float testFloat[256];

	curandStateMtgp32 *myState = &state[0];

	__syncthreads();

	testFunction(testFloat, myState);

	#ifdef CASE_B
		__syncthreads();
	#endif

	#ifdef CASE_C
		__syncthreads();
	#endif

	#ifdef CASE_D
		int waitNumber = fibonacci(35);

		if(threadIdx.x%32==0)
			printf("The %dth Fibonacci number is %d\n", 35, waitNumber);
	#endif

	printFloats(testFloat);

}

int main()
{

	int numberofMTGP32States = 200;
	curandStateMtgp32 *devMTGPStates;
	mtgp32_kernel_params *devKernelParams;
	int seed = 42;

	CUDA_CALL(cudaMalloc((void **)&devMTGPStates, numberofMTGP32States * sizeof(curandStateMtgp32)));

	/* Allocate space for MTGP kernel parameters */
	CUDA_CALL(cudaMalloc((void**)&devKernelParams, sizeof(mtgp32_kernel_params)));

	/* Reformat from predefined parameter sets to kernel format, */
	/* and copy kernel parameters to device memory               */
	CURAND_CALL(curandMakeMTGP32Constants(mtgp32dc_params_fast_11213, devKernelParams));

	/* Initialize one state per thread block */
	CURAND_CALL(curandMakeMTGP32KernelState(devMTGPStates, mtgp32dc_params_fast_11213, devKernelParams, numberofMTGP32States, seed));

	testKernel<<<1,256>>>(devMTGPStates);

	cudaDeviceSynchronize();

	cudaFree(devMTGPStates);
	cudaFree(devKernelParams);
}

and curandMTGP32SpecificDistributions.h just contains curand_mtgp32_specific plugged into CUDA's uniform distribution helper:

#ifndef CURAND_MTGP32_SPECIFIC_DISTRIBUTIONS_H
#define CURAND_MTGP32_SPECIFIC_DISTRIBUTIONS_H

#include <curand.h>
#include <curand_kernel.h>
#include <curand_globals.h>
#include <curand_uniform.h>
#include <curand_mtgp32_kernel.h>

__device__ __forceinline__ float curand_uniform_mtgp32_specific(curandStateMtgp32 *state, unsigned char index, unsigned char n)
{
        unsigned int randomNumber = curand_mtgp32_specific(state, index, n);
        return _curand_uniform(randomNumber);
}

#endif

I would appreciate some help with, or an explanation of, this behaviour, since it would be nice to know when __syncthreads() works in such a case, or what I did wrong. From the CUDA C documentation I gather that __syncthreads() shouldn’t be called in divergent (branching) code, but I thought it was still possible to use it outside of the branch.

I defined four macros, CASE_A, CASE_B, CASE_C and CASE_D, that make it easy to play around with the code.

In any case, thanks in advance to anyone who can help; this has me really baffled.

Best,

David

So I’ve looked a bit further, and it doesn’t seem to be a problem with the device function itself: if I put the code directly into the kernel, the behaviour doesn’t change, except that you need one less __syncthreads() before the fibonacci(int) call (which is a bit of a mystery to me, too). I also tried adding a __syncthreads() before the RNG call, but that changes little.

Best,

David

I’m not able to figure out what your claim is. I ran the code with and without CASE_B defined, and in either case it seemed that every other element of the output was 0.000000.

If you want help, my suggestion would be:

  1. include additional information like platform, CUDA version, compile command, and GPU you are running on
  2. provide a crisp definition of the expected output, and the actual output, and what to do with the defines to create each.

Hi txbob,

thank you for your reply! The above program is designed to

  1. fill every entry of testFloat with 0., then
  2. fill every even-indexed entry with a random number, then
  3. print every entry of testFloat.

#define CASE_A enables a __syncthreads() in the testFunction body.
#define CASE_B and #define CASE_C each enable a __syncthreads() in the kernel, so defining both gives you two __syncthreads() calls in succession.
#define CASE_D enables a call to a recursive function calculating Fibonacci numbers, which essentially just makes the kernel wait for a while, in case that helps with synchronizing. I’m not sure if it does, but I put it in there just in case.

If I define all of CASE_A through CASE_C, then every odd-indexed entry is zero and every even-indexed entry holds a random number, as it should be. However, if I define only one or two of them, many of the even-indexed entries are zero as well, which is not what I would expect after a call to __syncthreads().

I run my code on a Quadro P3000 card and compile with

nvcc --gpu-architecture=compute_61 --gpu-code=sm_61 --std=c++11 -g main.cu -o main.prog -lm -lcurand

I run CUDA 8.0, and deviceQuery gives

CUDA Driver Version / Runtime Version 9.0 / 8.0

It occurred to me that not putting a __syncthreads() after line 37 in the above code might be the issue, but this doesn’t change anything.

Best,

David

It would appear to me that a __syncthreads() both before and after the if (threadIdx.x<128) block is a necessity.

Christian

I commented out #define CASE_C and #define CASE_D and compiled and ran the code using CUDA 9.1 on Tesla P100 on CentOS 7.

It appears that every odd-index output is zero.

Perhaps you should try CUDA 9.1

@Christian, thank you for your reply. For me, this doesn’t change anything, unfortunately. I get what you mean, and it should be there, but it changes nothing on my computer.
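For clarity, this is roughly what I tried, i.e. testFunction with a barrier before and after the branch:

__device__ void testFunction(volatile float* testFloat, curandStateMtgp32 *state)
{
	testFloat[threadIdx.x] = 0.;

	__syncthreads();	// barrier before the divergent block, as suggested

	if (threadIdx.x < 128)
	{
		int numActiveThreads = 128;
		int address = threadIdx.x*2;

		testFloat[address] = curand_uniform_mtgp32_specific(state, (unsigned char)threadIdx.x, (unsigned char)numActiveThreads);
	}

	__syncthreads();	// barrier after the divergent block
}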

@txbob, thanks for your reply. I tested the code on a GTX 680 with CUDA 6.5 on Ubuntu 14.04.3 LTS and got the same results: the even-indexed entries are not correct when only using CASE_A and CASE_B. I will next try CUDA 9.1, also with cooperative groups, and see whether the behaviour is better then.

Best,

David

Have you run this program through cuda-memcheck to see if there’s anything broken with respect to the random generator during the initialization or access of generator states?

I have now (thanks, great idea), though no errors came up in either case.

So I have now run this with CUDA 9.1 on my Quadro card, as well as with the cooperative groups API (block.sync()), and it gives me the same behaviour. Still no errors in cuda-memcheck, and still the same requirement for three consecutive barriers (__syncthreads() or block.sync()).
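(By block.sync() I mean replacing each __syncthreads() with something along these lines:)

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

/* inside the kernel, instead of __syncthreads(): */
cg::thread_block block = cg::this_thread_block();
block.sync();   /* block-wide barrier, equivalent to __syncthreads() */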

Does it make a difference if you drop the random generators and write constant floating point values (e.g. 1.0f) into every 2nd element?

Christian

Another thought: Wouldn’t it be better (safer) to have a random generator state per thread instead of per thread block?

So, first off, Christian and txbob, thanks for your help.

I think I figured out the problem: curand_mtgp32_specific has two __syncthreads() calls in its body. That is a bit unfortunate, since it takes the number of active threads as an argument and is therefore likely (if not certain) to be called inside a branch. I’ve fixed this with an instance of cooperative_groups::coalesced_threads() syncing at the appropriate points (awesome feature, btw), and indeed I then only need one call to either __syncthreads() or group.sync().
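Roughly, the change looks like this: a sketch of the body of curand_mtgp32_specific with the two block-wide __syncthreads() calls swapped for coalesced-group syncs (the function name is just mine for illustration, and it needs #include <cooperative_groups.h>):

__device__ unsigned int curand_mtgp32_specific_coalesced(curandStateMtgp32 *state, unsigned char index, unsigned char n)
{
    cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();

    unsigned int t = index;
    int pos = state->k->pos_tbl[state->pIdx];

    unsigned int r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
                 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
                 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
                 state->pIdx);

    state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
    unsigned int o = temper(state->k, r,
               state->s[(t + state->offset + pos - 1) & MTGP32_STATE_MASK],
               state->pIdx);

    active.sync();   /* synchronizes only the threads that are currently coalesced in this warp */

    if (index == 0)
    {
        state->offset = (state->offset + n) & MTGP32_STATE_MASK;
    }

    active.sync();

    return o;
}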

I’ll file this at developer.nvidia.com, so maybe it will be changed in the future.

Best and thanks to you guys again,

David

Yes, I observe that if I compile with -lineinfo and run your code with

cuda-memcheck --tool synccheck …

I get divergent barrier errors reported at this line:

testFloat[address] = curand_uniform_mtgp32_specific(state, (unsigned char)threadIdx.x, (unsigned char)numActiveThreads);

Yup, that makes sense. The --tool synccheck option is a great feature; I only knew about plain memcheck without the --tool options before, but I’ll read the cuda-memcheck documentation. Thanks!

So it occurred to me that cooperative_groups::coalesced_threads() only works at warp level, not between warps (cf. https://devblogs.nvidia.com/cooperative-groups/). This makes my approach above unsafe. However, the following code should fix the problem. The function

curand_mtgp32_specific_conditional(curandStateMtgp32_t *state, unsigned char index, unsigned char n, unsigned int &randomNumber, const int &predicate)

takes, in addition to the original arguments of curand_mtgp32_specific, the arguments randomNumber and predicate. Most of the body of curand_mtgp32_specific_conditional is only executed when predicate != 0, and in that case randomNumber is overwritten with a new random number. In between, __syncthreads() is called in non-divergent code. It’s not pretty, but it works for what I have in mind.

The remaining problem, however, is that curand_mtgp32_specific_conditional() itself still cannot be called from inside branching code without causing undefined behaviour. If anyone knows how to synchronize divergent code, I would appreciate a pointer very much.

Here’s the new function:

#ifndef CURAND_MTGP32_SPECIFIC_CONDITIONAL_H
#define CURAND_MTGP32_SPECIFIC_CONDITIONAL_H

#include <curand.h>
#include <curand_globals.h>
#include <curand_uniform.h>
#include <curand_mtgp32_kernel.h>
#include <cuda.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <curand_mtgp32.h>
#include <cooperative_groups.h>

/**
 * From curand_mtgp32_kernel.h: altered to accommodate branching inside the function (via \p predicate), not at the call site! This code
 * may not itself be called from a branching statement, otherwise it will lead to undefined behaviour.
 *
 * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
 *
 * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
 * increment position of generator by \p n positions, which must be the total number of positions
 * updated in the state by the thread block, for this invocation.
 *
 * Note :
 * Thread indices must range from 0...\p n - 1.
 * The number of positions updated may not exceed 256.
 * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
 *
 * \param state - Pointer to state to update
 * \param index - Index (0..255) of the position within the state to draw from and update
 * \param n - The total number of positions in this state that are being updated by this invocation
 * \param randomNumber - Overwritten with 32 bits of pseudorandomness (all bits valid) when \p predicate is non-zero
 * \param predicate - Non-zero for the threads that should draw a number and update the state
 */
__forceinline__ __device__ void curand_mtgp32_specific_conditional(curandStateMtgp32_t *state, unsigned char index, unsigned char n, unsigned int &randomNumber, const int &predicate)
{
    if(predicate)
    {
        unsigned int t;
        int pos = state->k->pos_tbl[state->pIdx];
        unsigned int r;

        t = index;
        r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
                 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
                 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
                 state->pIdx);

        state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
        randomNumber = temper(state->k, r,
               state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
               state->pIdx);
    }

#if __CUDA_ARCH__ != 0
    /* block-wide barrier: reached by all threads, whether or not their predicate is set */
    __syncthreads();
#endif

    if(predicate)
    {
        if (index == 0)
        {
            state->offset = (state->offset + n) & MTGP32_STATE_MASK;
        }
    }

#if __CUDA_ARCH__ != 0
    /* second block-wide barrier, again outside the predicated code */
    __syncthreads();
#endif

}



__device__ __forceinline__ void curand_uniform_mtgp32_specific_conditional(curandStateMtgp32 *state, unsigned char index, unsigned char n,  float &randomNumber, int predicate)
{
    unsigned int randomInt; 
    curand_mtgp32_specific_conditional(state, index, n, randomInt, predicate);


    if(predicate)
    {
        randomNumber = _curand_uniform(randomInt);
    }
}


__device__ __forceinline__  void curand_uniform_mtgp32_specific_double_conditional(curandStateMtgp32 *state, unsigned char index, unsigned char n, volatile double &randomNumber, const int &predicate)
{
    unsigned int randomIntX; 
    curand_mtgp32_specific_conditional(state, index, n, randomIntX, predicate);

    unsigned int randomIntY; 
    curand_mtgp32_specific_conditional(state, index, n, randomIntY, predicate);

    if(predicate)
    {
       randomNumber = _curand_uniform_double_hq(randomIntX,randomIntY);
    }
}



#endif
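
For completeness, this is roughly how I call it now: the whole block executes the call, and the original branch is folded into the predicate (a sketch based on the testFunction above):

__device__ void testFunction(volatile float* testFloat, curandStateMtgp32 *state)
{
	testFloat[threadIdx.x] = 0.;

	__syncthreads();

	// every thread reaches the RNG call; the former if-branch becomes the predicate
	int predicate = (threadIdx.x < 128);
	float randomNumber = 0.f;

	curand_uniform_mtgp32_specific_conditional(state, (unsigned char)threadIdx.x, (unsigned char)128, randomNumber, predicate);

	if (predicate)
		testFloat[threadIdx.x*2] = randomNumber;

	__syncthreads();
}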