So I tried running my code under cuda-memcheck and it’s reporting all kinds of race conditions from my thrust call. Why is this?
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define N 10
const int tpb = 256;
const int bpg = 512;
__global__
void find_boundaries(const int n, // number of elements
const int b, // number of buckets (unique points)
const int *pa, // pa is bucket id
int *ht) // hash table
{
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < n; i += blockDim.x * gridDim.x)
{
const int prev = (i > 0 ? pa[i - 1] : 0);
const int curr = pa[i];
//printf("prev : %d; curr = %d; index = %d\n", prev, curr, i);
if (i == 0)
{
for (int j = 0; j <= pa[0]; ++j)
{
ht[j] = 0;
}
return;
}
if (prev != curr)
{
for (int j = prev; j < curr; ++j)
{
ht[j + 1] = i;
}
}
if (i == n - 1)
{
for (int j = curr; j < b; ++j)
{
ht[j + 1] = n;
}
}
}
}
__global__ void setup_kernel (curandState *state, unsigned long seed)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < N; i += blockDim.x * gridDim.x)
{
curand_init (seed, i, 0, &state[i]);
}
}
__global__ void generate(curandState* globalState, int *pa, int *ta )
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < N; i += blockDim.x * gridDim.x)
{
curandState localState = globalState[i];
float a = curand_uniform(&localState);
float b = curand_uniform(&localState);
globalState[i] = localState;
//printf("%f vs %f\n", a * 10, b * 10);
pa[i] = (int ) (a * 10);
ta[i] = (int ) (b * 10);
//printf("(%d, %d)\n", pa[i], ta[i]);
}
}
int main( int argc, char** argv)
{
cudaDeviceReset();
curandState *devStates;
cudaMalloc(&devStates, N * sizeof(*devStates));
int *pa = 0, *ta = 0, *ht = 0;
cudaMallocManaged(&pa, N * sizeof(*pa));
cudaMallocManaged(&ta, N * sizeof(*ta));
/* // Static input
pa[0] = pa[1] = pa[2] = 2;
pa[3] = pa[4] = pa[5] = 3;
pa[6] = 4;
pa[7] = 5;
pa[8] = 6;
pa[9] = 6;
for (int i = 0; i < N; ++i) ta[i] = i;
*/
// setup seeds
setup_kernel<<<bpg, tpb>>>(devStates, time(NULL));
// generate random numbers
generate<<<bpg, tpb>>>(devStates, pa, ta);
thrust::sort_by_key(thrust::device_ptr<int>(pa), thrust::device_ptr<int>(pa + N), thrust::device_ptr<int>(ta));
cudaDeviceSynchronize();
int b = pa[N - 1] + 1; // number of buckets
if (b < N)
b += N;
cudaMallocManaged(&ht, (b + 1) * sizeof(*ht));
find_boundaries<<<bpg, tpb>>>(N, b, pa, ht);
cudaDeviceSynchronize();
printf("ta : ");
for (int i = 0; i < N; ++i)
{
printf("%d ", ta[i]);
}
printf("\npa : ");
for (int i = 0; i < N; ++i)
{
printf("%d ", pa[i]);
}
printf("\n");
printf("id : ");
for (int i = 0; i < b + 1; ++i)
{
printf("%d ", i);
}
printf("\nht : ");
for (int i = 0; i < b + 1; ++i)
{
printf("%d ", ht[i]);
}
printf("\n");
cudaFree(ht);
cudaFree(ta);
cudaFree(pa);
cudaFree(devStates);
return 0;
}
Some output looks like this :
Race reported between Write access at 0x000007f8 in void thrust::system::cuda::detail::detail::b40c_thrust::SrtsScanSpine<void>(int*, thrust::system::cuda::detail::detail::b40c_thrust::SrtsScanSpine<void>, int)