Error/bug(?) in bitonic merge, related to shared memory usage.

What I am doing is a bitonic merge, where one thread block performs the merge operation. I use the same optimization that is done in the bitonic merge sort in the CUDA SDK, i.e. perform the last steps of the merge in shared memory. But when I do this I get garbage. I have simplified the code so that it merges 2 sequences, their size being identical to that of the thread block. When I replace the shared memory array with one located in global memory everything works fine.

Does anyone see an error on my part? I’ve double- and tripple-checked the code but I can’t find a reason why the version using shared memory does not work, while the other one does. I’ve tested this on a Geforce 9800 with CUDA 2.3 and on a Tesla c1070 with the CUDA 3.0 beta.

A repro-case, which I tried to simplify as much as possible:

#include <iostream>	

// Example demonstrating an error/bug(?) when using shared memory. The bitonic merge

// uses one thread block to merge 2 sequences that have the same size as the block, 

// it works when performed in global memory but not in shared memory. The code is pretty

// much like the one in the CUDA SDK, but simplified.

template<unsigned int sharedStride> // == blockDim.x

__global__ static void BitonicMerge(unsigned int *keys, unsigned int *sharedKeys)	

{

  sharedKeys[threadIdx.x] = keys[threadIdx.x];

  sharedKeys[threadIdx.x + sharedStride] = keys[threadIdx.x + sharedStride];

for (unsigned int stride = sharedStride; stride > 0; stride >>= 1)

  {

	__syncthreads();

	unsigned int pos = (threadIdx.x << 1) - (threadIdx.x & (stride - 1));

	unsigned int keyA = sharedKeys[pos];

	unsigned int keyB = sharedKeys[pos + stride];	

	if (keyB < keyA)

	{

	  sharedKeys[pos] = keyB;

	  sharedKeys[pos + stride] = keyA;

	}

  }

__syncthreads();

  keys[threadIdx.x] = sharedKeys[threadIdx.x];

  keys[threadIdx.x + sharedStride] = sharedKeys[threadIdx.x + sharedStride];

}

template<unsigned int sharedStride> // == blockDim.x

__global__ static void BitonicMergeShared(unsigned int *keys)	

{

  unsigned int sharedKeys[sharedStride << 1];

sharedKeys[threadIdx.x] = keys[threadIdx.x];

  sharedKeys[threadIdx.x + sharedStride] = keys[threadIdx.x + sharedStride];

for (unsigned int stride = sharedStride; stride > 0; stride >>= 1)

  {

	__syncthreads();

	unsigned int pos = (threadIdx.x << 1) - (threadIdx.x & (stride - 1));

	unsigned int keyA = sharedKeys[pos];

	unsigned int keyB = sharedKeys[pos + stride];	

	if (keyB < keyA)

	{

	  sharedKeys[pos] = keyB;

	  sharedKeys[pos + stride] = keyA;

	}

  }

__syncthreads();

  keys[threadIdx.x] = sharedKeys[threadIdx.x];

  keys[threadIdx.x + sharedStride] = sharedKeys[threadIdx.x + sharedStride];

}

int main()

{	

  const unsigned int blockSize = 32;

  unsigned int *input = new unsigned int[blockSize * 2];

  unsigned int *output = new unsigned int[blockSize * 2];

// Fill the input with a bitonic sequence (first half sorted ascending, second half descending).

  for (unsigned int i = 0; i < blockSize; ++i)

	input[i] = i;

  for (unsigned int i = blockSize; i < blockSize * 2; ++i)

	input[i] = blockSize * 2 - i;

std::cout << std::endl << "Input" << std::endl;

  for (int i = 0; i < blockSize * 2; ++i) std::cout << input[i] << " ";

unsigned int *dInput = 0;

  unsigned int *dShared = 0;

  unsigned int *dInputShared = 0;

// Bitonic merge in global memory works fine.

  cudaMalloc((void**)&dInput, blockSize * 2 * sizeof(unsigned int));

  cudaMalloc((void**)&dShared, blockSize * 2 * sizeof(unsigned int));		

  cudaMemcpy(dInput, input, blockSize * 2 * sizeof(unsigned int), 

	cudaMemcpyHostToDevice);	

BitonicMerge<blockSize><<<1, blockSize>>>(dInput, dShared);

cudaMemcpy(output, dInput, blockSize * 2 * sizeof(unsigned int), 

	cudaMemcpyDeviceToHost);	

std::cout << std::endl << std::endl << "Merging in global memory.." << std::endl;

  for (int i = 0; i < blockSize * 2; ++i) std::cout << output[i] << " ";

// In shared memory it does not.

  cudaMalloc((void**)&dInputShared, blockSize * 2 * sizeof(unsigned int));	

  cudaMemcpy(dInputShared, input, blockSize * 2 * sizeof(unsigned int), 

	cudaMemcpyHostToDevice);

BitonicMergeShared<blockSize><<<1, blockSize>>>(dInputShared);

cudaMemcpy(output, dInputShared, blockSize * 2 * sizeof(unsigned int), 

	cudaMemcpyDeviceToHost);	

std::cout << std::endl << std::endl << "Merging in shared memory.." << std::endl;

  for (int i = 0; i < blockSize * 2; ++i) std::cout << output[i] << " ";	

cudaFree(dInput);

  cudaFree(dShared);

  cudaFree(dInputShared);

delete [] input;	

  delete [] output;	

}

Sorry to disappoint, you’re using local memory, not shared memory because you clearly
did not use any shared keyword in the code.

So each thread sees its very own local copy of the array and that can’t work for sure.

Christian

Oh my, thanks for pointing out that stupid error :) The most severe case of code blindness I’ve had in quite a while…