Thanks txbob. I’ve cobbled together another a small program that uses cib::DeviceRadixSort::SortPairs() with the begin and end bits values that you recommend. In this case all the numbers in the key array are in the range [1,20] inclusive. Unfortunately the result of the sort is incorrect when begin/end bit is set as recommended.
BTW, I’m using the latest version of cub (1.7.4).
#include <iostream>
#include <cub/cub.cuh>
// Helper Functions Decl
void print( short* data , unsigned size , const char* name );
void allocateDeviceMemory( void* devPtr , unsigned size , int lineNumber );
void copyDataToHost( void* hostPtr , void* devPtr , unsigned size , int lineNumber );
void copyDataToDevice( void* devPtr , void* hostPtr , unsigned size , int lineNumber );
#define CHECK_ERROR( error ) \
if( error != cudaSuccess ) \
{ \
std::string msg( "[Error] " ); \
msg += cudaGetErrorString( error ); \
std::cerr << msg << std::endl; \
exit(-1); \
}
////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
cudaError_t status;
void* tmpStorage = 0;
size_t tmpStorageSize = 0;
static const unsigned COUNT = 20;
short h_values[COUNT] = { 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0};
short h_keys[COUNT] = { 17, 11, 16, 3, 10, 13, 1, 17, 14, 15, 18, 15, 10, 4, 8, 11, 15, 20, 10, 18 };
short* d_keys = 0;
short* d_values = 0;
unsigned dataSize = COUNT * sizeof(short);
allocateDeviceMemory( &d_keys , dataSize , __LINE__ );
allocateDeviceMemory( &d_values , dataSize , __LINE__ );
copyDataToDevice( d_keys , h_keys , dataSize , __LINE__ );
copyDataToDevice( d_values , h_values , dataSize , __LINE__ );
print( h_keys , COUNT , "Pre-sort Keys" );
print( h_values , COUNT , "Pre-sort Values" );
status = cub::DeviceRadixSort::SortPairs( tmpStorage , tmpStorageSize , d_keys , d_keys , d_values , d_values , COUNT ); // Correct
//status = cub::DeviceRadixSort::SortPairs( tmpStorage , tmpStorageSize , d_keys , d_keys , d_values , d_values , COUNT , 0 , 4 ); // BUG
CHECK_ERROR( status );
allocateDeviceMemory( &tmpStorage , tmpStorageSize , __LINE__ );
status = cub::DeviceRadixSort::SortPairs( tmpStorage , tmpStorageSize , d_keys , d_keys , d_values , d_values , COUNT ); // Correct
//status = cub::DeviceRadixSort::SortPairs( tmpStorage , tmpStorageSize , d_keys , d_keys , d_values , d_values , COUNT , 0 , 4 ); // BUG
CHECK_ERROR( status );
copyDataToHost( h_keys , d_keys , dataSize , __LINE__ );
copyDataToHost( h_values , d_values , dataSize , __LINE__ );
print( h_keys , COUNT , "Post-sort Keys" );
print( h_values , COUNT , "Post-sort Values" );
return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Helper Functions Impl
void print( short* data , unsigned size , const char* name )
{
std::cout << "[" << name << "]\n[ " << data[0];
for( unsigned i = 1; i < size; ++i )
{
std::cout << " , " << data[i];
}
std::cout << " ]" << std::endl;
}
void allocateDeviceMemory( void* devPtr , unsigned size , int lineNumber )
{
cudaError_t error = cudaMalloc( (void**) devPtr , size );
if( error != cudaSuccess )
{
std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to allocate device memory] " << cudaGetErrorString( error ) << std::endl;
exit(-1);
}
}
void copyDataToHost( void* hostPtr , void* devPtr , unsigned size , int lineNumber )
{
cudaError_t error = cudaMemcpy( hostPtr , devPtr , size , cudaMemcpyDeviceToHost );
if( error != cudaSuccess )
{
std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy device data to host memory] " << cudaGetErrorString( error ) << std::endl;
exit(-1);
}
}
void copyDataToDevice( void* devPtr , void* hostPtr , unsigned size , int lineNumber )
{
cudaError_t error = cudaMemcpy( devPtr , hostPtr , size , cudaMemcpyHostToDevice );
if( error != cudaSuccess )
{
std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy host data to device memory] " << cudaGetErrorString( error ) << std::endl;
exit(-1);
}
}