thrust::sort_by_key has race conditions?

So I tried running my code under cuda-memcheck and it’s reporting all kinds of race conditions from my thrust call. Why is this?

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>

#include <thrust/sort.h>
#include <thrust/device_ptr.h>

#define N 10

const int tpb = 256;
const int bpg = 512;

__global__
void find_boundaries(const int n, // number of elements
                     const int b, // number of buckets (unique points)
                     const int *pa, // pa is bucket id
                           int *ht) // hash table
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = tid; i < n; i += blockDim.x * gridDim.x)
    {
        const int prev = (i > 0 ? pa[i - 1] : 0);
        const int curr = pa[i];

        //printf("prev : %d; curr = %d; index = %d\n", prev, curr, i);

        if (i == 0)
        {
            for (int j = 0; j <= pa[0]; ++j)
            {
                ht[j] = 0;
            }

            return;
        }

        if (prev != curr)
        {
            for (int j = prev; j < curr; ++j)
            {
                ht[j + 1] = i;
            }
        }

        if (i == n - 1)
        {
            for (int j = curr; j < b; ++j)
            {
                ht[j + 1] = n;
            }
        }
    }
}

__global__ void setup_kernel (curandState *state, unsigned long seed)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = tid; i < N; i += blockDim.x * gridDim.x)
    {
        curand_init (seed, i, 0, &state[i]);
    }
} 

__global__ void generate(curandState* globalState, int *pa, int *ta ) 
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    for (int i = tid; i < N; i += blockDim.x * gridDim.x)
    {
        curandState localState = globalState[i];
        float a = curand_uniform(&localState);
        float b = curand_uniform(&localState);
        globalState[i] = localState;
        //printf("%f vs %f\n", a * 10, b * 10);

        pa[i] = (int ) (a * 10);
        ta[i] = (int ) (b * 10); 

        //printf("(%d, %d)\n", pa[i], ta[i]);
    }
}

int main( int argc, char** argv) 
{
    cudaDeviceReset();

    curandState *devStates;
    cudaMalloc(&devStates, N * sizeof(*devStates));
    
    int *pa = 0, *ta = 0, *ht = 0;
    cudaMallocManaged(&pa, N * sizeof(*pa));
    cudaMallocManaged(&ta, N * sizeof(*ta));
/*  // Static input
    pa[0] = pa[1] = pa[2] = 2;
    pa[3] = pa[4] = pa[5] = 3;
    pa[6] = 4;
    pa[7] = 5;
    pa[8] = 6;
    pa[9] = 6;

    for (int i = 0; i < N; ++i) ta[i] = i;
*/
    // setup seeds
    setup_kernel<<<bpg, tpb>>>(devStates, time(NULL));

    // generate random numbers
    generate<<<bpg, tpb>>>(devStates, pa, ta);

    thrust::sort_by_key(thrust::device_ptr<int>(pa), thrust::device_ptr<int>(pa + N), thrust::device_ptr<int>(ta));
    cudaDeviceSynchronize();

    int b = pa[N - 1] + 1; // number of buckets
    if (b < N)
        b += N;

    cudaMallocManaged(&ht, (b + 1)  * sizeof(*ht));

    find_boundaries<<<bpg, tpb>>>(N, b, pa, ht);
    cudaDeviceSynchronize();

    printf("ta : ");
    for (int i = 0; i < N; ++i)
    {
        printf("%d ", ta[i]);
    }
    printf("\npa : ");
    for (int i = 0; i < N; ++i)
    {
        printf("%d ", pa[i]);
    }

    printf("\n");
    printf("id : ");
    for (int i = 0; i < b + 1; ++i)
    {
        printf("%d ", i);
    }
    printf("\nht : ");
    for (int i = 0; i < b + 1; ++i)
    {
        printf("%d ", ht[i]);
    }
    printf("\n");

    cudaFree(ht);
    cudaFree(ta);
    cudaFree(pa);
    cudaFree(devStates);

    return 0;
}

Some output looks like this :

Race reported between Write access at 0x000007f8 in void thrust::system::cuda::detail::detail::b40c_thrust::SrtsScanSpine<void>(int*, thrust::system::cuda::detail::detail::b40c_thrust::SrtsScanSpine<void>, int)

I’m sorry to bump this thread but I’m having the same problem. Any guidelines?

Edit: Even when I run the sample with cuda-memcheck there are a lot of race conditions reported.