combining CUB and thrust crashes Mac!

I’m trying to introduce some CUB into my “old” thrust code, and so have started with a small example to compare thrust::reduce_by_key with cub::DeviceReduce::ReduceByKey, both applied to thrust::device_vectors. The thrust part of the code is fine, but the CUB part crashes the code with a memory error:
libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
If I try to access the contents of the device_vectors after the call to CUB, it crashes my Mac, which is making it hard to debug. Any help would be appreciated! The complete code is below. I have CUDA 6.5 and cub 1.3.2.

Thanks!
Sean

#include
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>

//========================================
// for CUB:
struct CustomSum
{
template
CUB_RUNTIME_FUNCTION host device forceinline
//host device forceinline
T operator()(const T &a, const T &b) const {
return b+a;
}
};
//========================================

int main()
{
const int Nkey=20;
int Nseg=9;
int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8};

thrust::device_vector key(ikey,ikey+Nkey);
thrust::device_vector keysout(Nkey);

// Let’s reduce x, by key:

float xval[Nkey];
for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;

thrust::device_vector x(xval,xval+Nkey);

// First, sort x by key:

thrust::sort_by_key(key.begin(),key.end(),x.begin());

//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" THRUST reduce_by_key:"<<std::endl
<<"=================================================================="<<std::endl;

thrust::device_vector output(Nseg,0.0f);

thrust::reduce_by_key(key.begin(),
key.end(),
x.begin(),
keysout.begin(),
output.begin());

for (int i=0;i<Nkey;i++) std::cout << x[i] <<" “; std::cout<<std::endl;
for (int i=0;i<Nkey;i++) std::cout << key[i] <<” “; std::cout<<std::endl;
for (int i=0;i<Nseg;i++) std::cout << output[i] <<” "; std::cout<<std::endl;

float ototal=thrust::reduce(output.begin(),output.end());
float xtotal=thrust::reduce(x.begin(),x.end());
std::cout << “total=”<< ototal <<", should be "<<xtotal<<std::endl;

//---------------------------------------------------------------------
std::cout<<"=================================================================="<<std::endl
<<" CUB ReduceByKey:"<<std::endl
<<"=================================================================="<<std::endl;

unsigned int *d_keys_in =thrust::raw_pointer_cast(&key[0]);
float *d_values_in =thrust::raw_pointer_cast(&x[0]);
unsigned int *d_keys_out =thrust::raw_pointer_cast(&keysout[0]);
float *d_values_out=thrust::raw_pointer_cast(&output[0]);
int *d_num_segments=&Nseg;
CustomSum reduction_op;

std::cout << “CUB input” << std::endl;
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;

// Determine temporary device storage requirements
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);

// Allocate temporary storage
cudaMalloc(&d_temp_storage, temp_storage_bytes);
std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;

// Run reduce-by-key
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);

std::cout << “CUB output” << std::endl;

std::cout<<Nkey<<" "<<Nseg<<std::endl;
std::cout<<key.size() << " "<<x.size() << " "<<keysout.size() << " "<<output.size() << std::endl;

// At this point onward it dies:
//libc++abi.dylib: terminate called throwing an exception
//Abort trap: 6

// If the next line is uncommented, it crashes the Mac!
for (int i=0; i<Nkey; ++i) std::cout << key[i] << " "; std::cout<<std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << x[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " "; std::cout<< std::endl;
// for (int i=0; i<Nseg; ++i) std::cout << output[i] << " "; std::cout<< std::endl;
cudaFree(d_temp_storage);

ototal=thrust::reduce(output.begin(),output.end());
xtotal=thrust::reduce(x.begin(),x.end());
std::cout << “total=”<< ototal <<", should be "<<xtotal<<std::endl;
return 1;
}

The intent of this line of code is illegal:

int *d_num_segments=&Nseg;

Clearly you intend the pointer d_num_segments to be usable on the device, because you are passing it to cub::DeviceReduce:

cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);

However this is not a valid way to create a device pointer. You cannot take the address of a host variable (Nseg) and use it as a device pointer. When cub attempts to dereference this pointer, you get an illegal access.

If you replace this line of code:

int *d_num_segments=&Nseg;

with this:

int *d_num_segments;
cudaMalloc(&d_num_segments, sizeof(int));

I believe you’ll have better results.

Thank you very much. That was the problem.