Does the volatile keyword work properly on global memory

I’m using volatile for Reducing with Unrolled Warps, but volatile didn’t work the way I expected.
When I used ‘volatile’ on global memory, I got a wrong outputs, and the output changes from run to run. However, volatile works well on shared memory. How can volatile be used correctly?

#include<cuda_runtime.h>
#include<iostream>

#define CUDA_CHECK(call)\
{\
const cudaError_t error = call;\
if(error != cudaSuccess)\
{\
printf("Error: %s:%d, ", __FILE__, __LINE__);\
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error));\
exit(1);\
}\
}

using namespace std;

#define len int(1<<24)
#define size len * sizeof(int)
#define uint unsigned int

template<typename scalar_t>
__global__ void reduceGmemKernel(scalar_t* A)
{
	uint tid = threadIdx.x;
	scalar_t* idata = A + blockIdx.x * blockDim.x;

	uint idx = threadIdx.x + blockDim.x * blockIdx.x;
	if (idx >= len)
		return;

	if (blockDim.x >= 1024 && tid < 512)
		idata[tid] += idata[tid + 512];
	__syncthreads();
	if (blockDim.x >= 512 && tid < 256)
		idata[tid] += idata[tid + 256];
	__syncthreads();
	if (blockDim.x >= 256 && tid < 128)
		idata[tid] += idata[tid + 128];
	__syncthreads();
	if (blockDim.x >= 128 && tid < 64)
		idata[tid] += idata[tid + 64];
	__syncthreads();

	// without unrolling warp and volatile, outputs is correct
	//if (blockDim.x >= 64 && tid < 32)
	//	idata[tid] += idata[tid + 32];
	//__syncthreads();
	//if (blockDim.x >= 32 && tid < 16)
	//	idata[tid] += idata[tid + 16];
	//__syncthreads();
	//if (blockDim.x >= 16 && tid < 8)
	//	idata[tid] += idata[tid + 8];
	//__syncthreads();
	//if (blockDim.x >= 8 && tid < 4)
	//	idata[tid] += idata[tid + 4];
	//__syncthreads();
	//if (blockDim.x >= 4 && tid < 2)
	//	idata[tid] += idata[tid + 2];
	//__syncthreads();
	//if (blockDim.x >= 2 && tid < 1)
	//	idata[tid] += idata[tid + 1];
	//__syncthreads();
	
	/*-------------------------------------------------------------------------------------------------*/
	// using volatile for unrolling warp, the outputs is incorrect
	if (tid < 32)
	{
		volatile scalar_t* vsmem = idata;
		vsmem[tid] += vsmem[tid + 32];
		vsmem[tid] += vsmem[tid + 16];
		vsmem[tid] += vsmem[tid + 8];
		vsmem[tid] += vsmem[tid + 4];
		vsmem[tid] += vsmem[tid + 2];
		vsmem[tid] += vsmem[tid + 1];
	}

	if (tid == 0)
		A[blockIdx.x] = idata[0];
}

template<typename scalar_t>
__global__ void reduceSmemKernel(scalar_t* A)
{
	uint tid = threadIdx.x;
	uint idx = threadIdx.x + blockDim.x * blockIdx.x;
	
	extern __shared__ scalar_t smem[];

	if (idx >= len)
		return;

	smem[tid] = A[idx];
	__syncthreads();

	if (blockDim.x >= 1024 && tid < 512)
		smem[tid] += smem[tid + 512];
	__syncthreads();
	if (blockDim.x >= 512 && tid < 256)
		smem[tid] += smem[tid + 256];
	__syncthreads();
	if (blockDim.x >= 256 && tid < 128)
		smem[tid] += smem[tid + 128];
	__syncthreads();
	if (blockDim.x >= 128 && tid < 64)
		smem[tid] += smem[tid + 64];
	__syncthreads();

	if (tid < 32)
	{
		volatile scalar_t* vsmem = smem;
		vsmem[tid] += vsmem[tid + 32];
		vsmem[tid] += vsmem[tid + 16];
		vsmem[tid] += vsmem[tid + 8];
		vsmem[tid] += vsmem[tid + 4];
		vsmem[tid] += vsmem[tid + 2];
		vsmem[tid] += vsmem[tid + 1];
	}
	
	if (tid == 0)
		A[blockIdx.x] = smem[0];
}


void reduceGmemLaunch()
{
	int* A = new int[len];
	for (int i = 0; i < len; ++i)
		A[i] = 1;
	int* A_cuda;
	CUDA_CHECK(cudaMalloc((void**)&A_cuda, size));
	CUDA_CHECK(cudaMemcpy(A_cuda, A, size, cudaMemcpyHostToDevice));
	dim3 threadSize(128);
	dim3 blockSize((len + threadSize.x - 1) / threadSize.x);
	reduceGmemKernel<<<blockSize, threadSize>>>(A_cuda);
	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess)
		fprintf(stderr, "reduceGmemLaunch launch failed: %s\n", cudaGetErrorString(cudaStatus));
	int* res = new int[blockSize.x];
	CUDA_CHECK(cudaMemcpy(res, A_cuda, sizeof(int) * blockSize.x, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(A_cuda));
	for (int i = 1; i < blockSize.x; ++i)
		res[0] += res[i];
	cout << "Gmem Result=" << res[0] << endl;
}

void reduceSmemLaunch()
{
	int* A = new int[len];
	for (int i = 0; i < len; ++i)
		A[i] = 1;
	int* A_cuda;
	CUDA_CHECK(cudaMalloc((void**)&A_cuda, size));
	CUDA_CHECK(cudaMemcpy(A_cuda, A, size, cudaMemcpyHostToDevice));
	dim3 threadSize(128);
	dim3 blockSize((len + threadSize.x - 1) / threadSize.x);
	reduceSmemKernel<<<blockSize, threadSize, threadSize.x * sizeof(int)>>>(A_cuda);
	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess)
		fprintf(stderr, "reduceSmemLaunch launch failed: %s\n", cudaGetErrorString(cudaStatus));
	int* res = new int[blockSize.x];
	CUDA_CHECK(cudaMemcpy(res, A_cuda, sizeof(int) * blockSize.x, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(A_cuda));
	for (int i = 1; i < blockSize.x; ++i)
		res[0] += res[i];
	cout << "Smem Result=" << res[0] << endl;
}

int main()
{
	reduceGmemLaunch();
	reduceSmemLaunch();
	return 0;
}

On volta and beyond, your code is broken for the reasons described here. (refer to listing 8, for example.) Your final reduction (at the warp level) not only requires volatile for memory ordering, but warp-synchronous behavior. i.e. it effectively requires an execution barrier at each step. But you haven’t provided that. Note that __syncwarp() provides both the execution barrier and memory ordering necessities.

on pre-volta, your code also has a race condition at each step in the final warp-level reduction. This could be eliminated by prefacing each line with e.g.:

	if (tid < 16) vsmem[tid] += vsmem[tid + 16];

For the shared memory variant, running your code with compute-sanitizer --tool racecheck ... may be of interest. As a practical matter (so you don’t have to wait a long time for results) reduce your problem size before doing that.

1 Like

Thanks for your reply, I modified my code according to the blog you mentioned. And there are 0 errors in compute-sanitizer --tool racecheck now. But the reduction kernel of global memory still output incorrect result if I fix the number of active threads in the final warp-level reduction (in order to avoid warp branch). This result really confuse me. Could you give me any further interpretation?

#include<cuda_runtime.h>
#include<iostream>

#define CUDA_CHECK(call)\
{\
const cudaError_t error = call;\
if(error != cudaSuccess)\
{\
printf("Error: %s:%d, ", __FILE__, __LINE__);\
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error));\
exit(1);\
}\
}

using namespace std;

#define len int(1<<24)
#define size len * sizeof(int)
#define uint unsigned int

template<typename scalar_t>
__global__ void reduceGmemKernel(scalar_t* A)
{
	uint tid = threadIdx.x;
	scalar_t* idata = A + blockIdx.x * blockDim.x;

	uint idx = threadIdx.x + blockDim.x * blockIdx.x;
	if (idx >= len)
		return;

	scalar_t v = idata[tid];
	for (int stride = blockDim.x / 2; stride >= 32; stride >>= 1)
	{
		if(tid < stride) v += idata[tid + stride];
		__syncthreads();
		__threadfence();
		if (tid < stride) idata[tid] = v;
		__syncthreads();
		__threadfence();
	}

	/*---------------------------------------------------------------*/
        // output is correct in this case
	//if (tid < 32) v = idata[tid];	__syncwarp(); __threadfence();
	//for (int stride = 16; stride > 0; stride >>= 1)
	//{
	//	if (tid < stride) v += idata[tid + stride];	__syncwarp(); __threadfence();
	//	if (tid < stride) idata[tid] = v;			__syncwarp(); __threadfence();
	//}
	/*--------------------------------------------------------------*/
       // output is incorrect if I fix the number of active threads in the warp
	if (tid < 32) v = idata[tid];	__syncwarp(); __threadfence();
	for (int stride = 16; stride > 0; stride >>= 1)
	{
		if (tid < 32) v += idata[tid + stride];	__syncwarp(); __threadfence();
		if (tid < 32) idata[tid] = v;			__syncwarp(); __threadfence();
	}
	/*--------------------------------------------------------------*/
	if (tid == 0)
		A[blockIdx.x] = idata[0];
}

template<typename scalar_t>
__global__ void reduceSmemKernel(scalar_t* A)
{
	uint tid = threadIdx.x;
	uint idx = threadIdx.x + blockDim.x * blockIdx.x;
	
	extern __shared__ scalar_t smem[];

	if (idx >= len)
		return;

	smem[tid] = A[idx];
	__syncthreads();

	scalar_t v = smem[tid];
	for (int stride = blockDim.x / 2; stride >= 32; stride >>= 1)
	{
		if (tid < stride) v += smem[tid + stride];
		__syncthreads();
		__threadfence();
		if (tid < stride) smem[tid] = v;
		__syncthreads();
		__threadfence();
	}

	if (tid < 32)
	{
		v = smem[tid]; __syncwarp();
		for (int stride = 16; stride > 0; stride >>= 1)
		{
			v += smem[tid + stride];	__syncwarp();
			smem[tid] = v;				__syncwarp();
		}
	}
	
	if (tid == 0)
		A[blockIdx.x] = smem[0];
}


int reduceGmemLaunch()
{
	int* A = new int[len];
	for (int i = 0; i < len; ++i)
		A[i] = 1;
	int* A_cuda;
	CUDA_CHECK(cudaMalloc((void**)&A_cuda, size));
	CUDA_CHECK(cudaMemcpy(A_cuda, A, size, cudaMemcpyHostToDevice));
	dim3 blockSize(128);
	dim3 gridSize((len + blockSize.x - 1) / blockSize.x);
	reduceGmemKernel<<<gridSize, blockSize >>>(A_cuda);
	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess)
		fprintf(stderr, "reduceGmemLaunch launch failed: %s\n", cudaGetErrorString(cudaStatus));
	int* res = new int[gridSize.x];
	CUDA_CHECK(cudaMemcpy(res, A_cuda, sizeof(int) * gridSize.x, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(A_cuda));
	for (int i = 1; i < gridSize.x; ++i)
		res[0] += res[i];
	return res[0];
}

int reduceSmemLaunch()
{
	int* A = new int[len];
	for (int i = 0; i < len; ++i)
		A[i] = 1;
	int* A_cuda;
	CUDA_CHECK(cudaMalloc((void**)&A_cuda, size));
	CUDA_CHECK(cudaMemcpy(A_cuda, A, size, cudaMemcpyHostToDevice));
	dim3 blockSize(128);
	dim3 gridSize((len + blockSize.x - 1) / blockSize.x);
	reduceSmemKernel<<<gridSize, blockSize, blockSize.x * sizeof(int)>>>(A_cuda);
	cudaError_t cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess)
		fprintf(stderr, "reduceSmemLaunch launch failed: %s\n", cudaGetErrorString(cudaStatus));
	int* res = new int[gridSize.x];
	CUDA_CHECK(cudaMemcpy(res, A_cuda, sizeof(int) * gridSize.x, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(A_cuda));
	for (int i = 1; i < gridSize.x; ++i)
		res[0] += res[i];
	return res[0];
}

int main()
{
	int Shmem_result, Gmem_result = 0;
	for (int i = 0; i < 10; ++i)
	{
		Gmem_result = reduceGmemLaunch();
		Shmem_result = reduceSmemLaunch();
               // print the result if it is incorrect
		if (Gmem_result != 16777216 || Shmem_result != 16777216)	
			cout << "Gmem=" << Gmem_result << "\t" 
				<< "Shmem=" << Shmem_result << endl;
	}
	return 0;
}

Don’t try to use the same array for both input and output. CUDA doesn’t provide any guarantee as to order of block execution. So this is broken:

if (tid == 0)
	A[blockIdx.x] = idata[0];

Your suggestion that your code is correct if you use:

if (tid < stride)

is not correct. I’m not going to respond to questions like “why does the shared memory version always give the correct answer?” This sort of input-output ordering obviously depends on how you use the input array exactly, the size of the data, and the GPU you are running on, which you have not indicated here. In any event, your use of the input array as the output is broken.

I should have noticed this first. When I make this change your original code works for me. I still recommend paying attention to the concerns raised in the blog.

1 Like

This is exactly the problem with my code. Your reply really solved my confuse. Thank you a lot.