What I am doing is a bitonic merge, where one thread block performs the merge operation. I use the same optimization that is done in the bitonic merge sort in the CUDA SDK, i.e. perform the last steps of the merge in shared memory. But when I do this I get garbage. I have simplified the code so that it merges 2 sequences, their size being identical to that of the thread block. When I replace the shared memory array with one located in global memory everything works fine.
Does anyone see an error on my part? I’ve double- and tripple-checked the code but I can’t find a reason why the version using shared memory does not work, while the other one does. I’ve tested this on a Geforce 9800 with CUDA 2.3 and on a Tesla c1070 with the CUDA 3.0 beta.
A repro-case, which I tried to simplify as much as possible:
#include <iostream>
// Example demonstrating an error/bug(?) when using shared memory. The bitonic merge
// uses one thread block to merge 2 sequences that have the same size as the block,
// it works when performed in global memory but not in shared memory. The code is pretty
// much like the one in the CUDA SDK, but simplified.
template<unsigned int sharedStride> // == blockDim.x
__global__ static void BitonicMerge(unsigned int *keys, unsigned int *sharedKeys)
{
sharedKeys[threadIdx.x] = keys[threadIdx.x];
sharedKeys[threadIdx.x + sharedStride] = keys[threadIdx.x + sharedStride];
for (unsigned int stride = sharedStride; stride > 0; stride >>= 1)
{
__syncthreads();
unsigned int pos = (threadIdx.x << 1) - (threadIdx.x & (stride - 1));
unsigned int keyA = sharedKeys[pos];
unsigned int keyB = sharedKeys[pos + stride];
if (keyB < keyA)
{
sharedKeys[pos] = keyB;
sharedKeys[pos + stride] = keyA;
}
}
__syncthreads();
keys[threadIdx.x] = sharedKeys[threadIdx.x];
keys[threadIdx.x + sharedStride] = sharedKeys[threadIdx.x + sharedStride];
}
template<unsigned int sharedStride> // == blockDim.x
__global__ static void BitonicMergeShared(unsigned int *keys)
{
unsigned int sharedKeys[sharedStride << 1];
sharedKeys[threadIdx.x] = keys[threadIdx.x];
sharedKeys[threadIdx.x + sharedStride] = keys[threadIdx.x + sharedStride];
for (unsigned int stride = sharedStride; stride > 0; stride >>= 1)
{
__syncthreads();
unsigned int pos = (threadIdx.x << 1) - (threadIdx.x & (stride - 1));
unsigned int keyA = sharedKeys[pos];
unsigned int keyB = sharedKeys[pos + stride];
if (keyB < keyA)
{
sharedKeys[pos] = keyB;
sharedKeys[pos + stride] = keyA;
}
}
__syncthreads();
keys[threadIdx.x] = sharedKeys[threadIdx.x];
keys[threadIdx.x + sharedStride] = sharedKeys[threadIdx.x + sharedStride];
}
int main()
{
const unsigned int blockSize = 32;
unsigned int *input = new unsigned int[blockSize * 2];
unsigned int *output = new unsigned int[blockSize * 2];
// Fill the input with a bitonic sequence (first half sorted ascending, second half descending).
for (unsigned int i = 0; i < blockSize; ++i)
input[i] = i;
for (unsigned int i = blockSize; i < blockSize * 2; ++i)
input[i] = blockSize * 2 - i;
std::cout << std::endl << "Input" << std::endl;
for (int i = 0; i < blockSize * 2; ++i) std::cout << input[i] << " ";
unsigned int *dInput = 0;
unsigned int *dShared = 0;
unsigned int *dInputShared = 0;
// Bitonic merge in global memory works fine.
cudaMalloc((void**)&dInput, blockSize * 2 * sizeof(unsigned int));
cudaMalloc((void**)&dShared, blockSize * 2 * sizeof(unsigned int));
cudaMemcpy(dInput, input, blockSize * 2 * sizeof(unsigned int),
cudaMemcpyHostToDevice);
BitonicMerge<blockSize><<<1, blockSize>>>(dInput, dShared);
cudaMemcpy(output, dInput, blockSize * 2 * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
std::cout << std::endl << std::endl << "Merging in global memory.." << std::endl;
for (int i = 0; i < blockSize * 2; ++i) std::cout << output[i] << " ";
// In shared memory it does not.
cudaMalloc((void**)&dInputShared, blockSize * 2 * sizeof(unsigned int));
cudaMemcpy(dInputShared, input, blockSize * 2 * sizeof(unsigned int),
cudaMemcpyHostToDevice);
BitonicMergeShared<blockSize><<<1, blockSize>>>(dInputShared);
cudaMemcpy(output, dInputShared, blockSize * 2 * sizeof(unsigned int),
cudaMemcpyDeviceToHost);
std::cout << std::endl << std::endl << "Merging in shared memory.." << std::endl;
for (int i = 0; i < blockSize * 2; ++i) std::cout << output[i] << " ";
cudaFree(dInput);
cudaFree(dShared);
cudaFree(dInputShared);
delete [] input;
delete [] output;
}