Here’s a version of your code with all CUDA elements stripped out. It demonstrates that your lambda capture mechanism is not working the way you expect:
$ cat t1348.cpp
#include <stdio.h>
#include <vector>
#include <thread>
using namespace std;
// CPU stand-in for the original CUDA launch wrapper: it computes the
// would-be launch configuration but launches nothing, so the test case
// isolates the threading behavior from any GPU work.
void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
int threads = 1024;
int blocks = (N + threads - 1) / threads; // ceil-divide: smallest grid covering N
}
// CPU stand-in for the CUDA Partition class: same members and interface,
// but the constructor/Release do no device work. Sort() prints the state
// it observes, which is how the capture bug becomes visible in the output.
class Partition
{
public:
int N;
const int m_deviceId;
uint32_t * d_keys_in = NULL;
uint32_t * d_vals_in = NULL;
uint32_t * d_keys_out = NULL;
uint32_t * d_vals_out = NULL;
void * d_temp = NULL;
size_t temp_size = 0;
// keys/vals are unused here; only N and the device id are recorded.
Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
{
}
// Prints the partition state, then invokes the (no-op) modify wrapper.
// In the failing run, N prints as 0 because the caller indexed past the
// end of the vector (see main).
void Sort(int i)
{
printf("N = %d, device_id = %d\n", N, m_deviceId);
modify(d_keys_in, N, i + m_deviceId);
}
void Release()
{
}
};
// Demonstrates the lambda-capture race: without -DFIX the lambda captures
// everything (including the loop variable threadId) by reference; the loop
// mutates threadId before the spawned threads run, so both threads read a
// stale/out-of-range index. With -DFIX, threadId and i are captured by value.
int main()
{
const int N = 2 * 1000 * 1000;
srand(time(NULL));
uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;
keys_in = new uint32_t[N];
vals_in = new uint32_t[N];
keys_out = new uint32_t[N];
vals_out = new uint32_t[N];
for (uint32_t i = 0; i < N; i++)
{
keys_in[i] = rand() & 255;
vals_in[i] = i;
}
// Two partitions, one per simulated device.
vector<Partition> ps;
for (int i = 0; i < 2; i++)
{
ps.push_back({ i, N, keys_in, vals_in });
}
thread * threads = new thread[2];
for (uint32_t i = 0; i < 1; i++)
{
for (size_t threadId = 0; threadId < 2; threadId++)
{
#ifndef FIX
// Broken: threadId is captured by reference and may advance (to 2,
// past the end of ps) before the thread body executes.
auto f = [&]() {
#else
// Fixed: threadId and i are copied into the closure at creation time.
auto f = [&ps, threadId, i]() {
#endif
ps[threadId].Sort(i);
};
threads[threadId] = thread(f);
}
for (size_t threadId = 0; threadId < 2; threadId++)
{
threads[threadId].join();
}
}
for (int i = 0; i < 2; i++)
{
ps[i].Release();
}
if (keys_in) delete[] keys_in;
if (vals_in) delete[] vals_in;
if (keys_out) delete[] keys_out;
if (vals_out) delete[] vals_out;
if (threads) delete[] threads;
return 0;
}
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread
$ ./t1348cpu
N = 0, device_id = 0
N = 0, device_id = 0
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread -DFIX
$ ./t1348cpu
N = 2000000, device_id = 0
N = 2000000, device_id = 1
$
(g++ 4.8.5)
The takeaway according to my testing is that this has nothing to do with CUB or CUDA, but with a broken threading methodology. The theory I have to explain this is roughly described here:
https://stackoverflow.com/questions/36325039/starting-c11-thread-with-a-lambda-capturing-local-variable
When you capture variables by reference in a lambda and use that lambda as a thread function, the resolution of those (capture-by-reference) variables does not actually occur until the thread starts executing - asynchronously - sometime later. Since threadId is your loop variable, it seems fairly evident that capturing it by reference is a bad idea: the loop iterates twice (at least) before the threads actually begin executing, so the threadId variable may get updated by the loop iteration before the threads consume it. According to my testing, it is sufficient to do this:
auto f = [&, threadId]() {
to work around this problem, which I believe is consistent with my description. For additional supporting evidence, we can modify our test case slightly to have Sort() accept an additional integer parameter and print it out. We will pass threadId to it:
$ cat t1348.cpp
#include <stdio.h>
#include <vector>
#include <thread>
using namespace std;
// CPU stand-in for the CUDA launch wrapper (second test case): computes
// the would-be launch configuration only; no work is performed.
void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
int threads = 1024;
int blocks = (N + threads - 1) / threads; // ceil-divide: smallest grid covering N
}
// Same CPU stand-in as before, except Sort() now also receives and prints
// the thread index (tID) so the stale captured value can be observed directly.
class Partition
{
public:
int N;
const int m_deviceId;
uint32_t * d_keys_in = NULL;
uint32_t * d_vals_in = NULL;
uint32_t * d_keys_out = NULL;
uint32_t * d_vals_out = NULL;
void * d_temp = NULL;
size_t temp_size = 0;
// keys/vals are unused here; only N and the device id are recorded.
Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
{
}
// tID is the threadId value the lambda passed in; in the failing run it
// prints as 2 for both threads, proving the by-reference capture went stale.
void Sort(int i, int tID)
{
printf("N = %d, device_id = %d, tID = %d\n", N, m_deviceId, tID);
modify(d_keys_in, N, i + m_deviceId);
}
void Release()
{
}
};
// Same demonstration as the first test case, but the thread index is also
// forwarded into Sort() so the value each thread actually observes is printed.
int main()
{
const int N = 2 * 1000 * 1000;
srand(time(NULL));
uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;
keys_in = new uint32_t[N];
vals_in = new uint32_t[N];
keys_out = new uint32_t[N];
vals_out = new uint32_t[N];
for (uint32_t i = 0; i < N; i++)
{
keys_in[i] = rand() & 255;
vals_in[i] = i;
}
// Two partitions, one per simulated device.
vector<Partition> ps;
for (int i = 0; i < 2; i++)
{
ps.push_back({ i, N, keys_in, vals_in });
}
thread * threads = new thread[2];
for (uint32_t i = 0; i < 1; i++)
{
for (size_t threadId = 0; threadId < 2; threadId++)
{
#ifndef FIX
// Broken: threadId captured by reference goes stale (reaches 2) before
// the thread runs.
auto f = [&]() {
#else
// Fixed: default capture-by-reference, but threadId copied by value.
auto f = [&, threadId]() {
#endif
ps[threadId].Sort(i,threadId);
};
threads[threadId] = thread(f);
}
for (size_t threadId = 0; threadId < 2; threadId++)
{
threads[threadId].join();
}
}
for (int i = 0; i < 2; i++)
{
ps[i].Release();
}
if (keys_in) delete[] keys_in;
if (vals_in) delete[] vals_in;
if (keys_out) delete[] keys_out;
if (vals_out) delete[] vals_out;
if (threads) delete[] threads;
return 0;
}
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread
$ ./t1348cpu
N = 0, device_id = 0, tID = 2
N = 0, device_id = 0, tID = 2
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread -DFIX
$ ./t1348cpu
N = 2000000, device_id = 1, tID = 1
N = 2000000, device_id = 0, tID = 0
$
We now see that in the failing case, the passed threadId value has the illegal index of 2 (for both threads, since the loop has actually updated the loop variable twice, by the time either thread actually starts executing), whereas in the passing case, with the capture-by-value of threadId, we get the expected results.
As a final test, when I take a suitably modified version of your original code:
- add necessary headers and preamble
- remove references to GPUStopWatch
- reduce main loop count to a sane number
- modify the lambda to capture threadId by value
your code runs without any runtime or CUDA API error for me:
$ cat t1348.cu
#include <cub/cub.cuh>
#include <stdio.h>
#include <helper_cuda.h>
#include <vector>
#include <thread>
using namespace std;
// Kernel: adds d to each key in-place, wrapping into the 0..255 range.
// 1D launch; the i < N guard handles the grid tail when N is not a
// multiple of the block size.
__global__ void gpu_modify(uint32_t * d_keys_in, int N, uint32_t d)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N)
{
d_keys_in[i] = (d_keys_in[i] + d) & 255;
}
}
// Host wrapper: launches gpu_modify with enough blocks to cover N elements
// (ceil-divide by the 1024-thread block size) on the current device/stream.
void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
int threads = 1024;
int blocks = (N + threads - 1) / threads;
gpu_modify << <blocks, threads >> > (d_keys_in, N, d);
}
// Owns the device buffers for one GPU's share of the sort. Each method
// begins with cudaSetDevice(m_deviceId) so the per-thread device context
// matches the partition, regardless of which host thread calls it.
class Partition
{
public:
int N;
const int m_deviceId;
uint32_t * d_keys_in = NULL;
uint32_t * d_vals_in = NULL;
uint32_t * d_keys_out = NULL;
uint32_t * d_vals_out = NULL;
void * d_temp = NULL;
size_t temp_size = 0;
// Allocates device buffers on the given device, uploads the host keys/vals,
// then sizes and allocates CUB's scratch buffer.
Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
{
checkCudaErrors(cudaSetDevice(deviceId));
checkCudaErrors(cudaMalloc((void **)&d_keys_in, N * sizeof(uint32_t)));
checkCudaErrors(cudaMalloc((void **)&d_vals_in, N * sizeof(uint32_t)));
checkCudaErrors(cudaMalloc((void **)&d_keys_out, N * sizeof(uint32_t)));
checkCudaErrors(cudaMalloc((void **)&d_vals_out, N * sizeof(uint32_t)));
checkCudaErrors(cudaMemcpy(d_keys_in, keys, N * sizeof(uint32_t), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_vals_in, vals, N * sizeof(uint32_t), cudaMemcpyHostToDevice));
// First SortPairs call with d_temp == NULL is CUB's size query: it only
// writes the required scratch size into temp_size.
cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMalloc(&d_temp, temp_size));
}
// Perturbs the keys on-device, then radix-sorts (key, value) pairs into
// the out buffers using the pre-sized scratch space.
void Sort(int i)
{
checkCudaErrors(cudaSetDevice(m_deviceId));
modify(d_keys_in, N, i + m_deviceId);
checkCudaErrors(cudaGetLastError()); // catch kernel launch errors
cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
checkCudaErrors(cudaGetLastError());
}
// Frees all device buffers on this partition's device.
void Release()
{
checkCudaErrors(cudaSetDevice(m_deviceId));
if (d_keys_in) checkCudaErrors(cudaFree(d_keys_in));
if (d_vals_in) checkCudaErrors(cudaFree(d_vals_in));
if (d_keys_out) checkCudaErrors(cudaFree(d_keys_out));
if (d_vals_out) checkCudaErrors(cudaFree(d_vals_out));
if (d_temp) checkCudaErrors(cudaFree(d_temp));
}
};
// Full CUDA test case with the repaired lambda ([&, threadId] captures the
// loop index by value). Runs 10 iterations of: sort both partitions on
// separate host threads, then copy back and print the first two sorted
// (key, value) pairs from each device.
int main()
{
const int N = 2 * 1000 * 1000;
srand(time(NULL));
uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;
keys_in = new uint32_t[N];
vals_in = new uint32_t[N];
keys_out = new uint32_t[N];
vals_out = new uint32_t[N];
for (uint32_t i = 0; i < N; i++)
{
keys_in[i] = rand() & 255;
vals_in[i] = i;
}
checkCudaErrors(cudaSetDevice(0));
// One partition per device; each constructor allocates on its own device.
vector<Partition> ps;
for (int i = 0; i < 2; i++)
{
ps.push_back({ i, N, keys_in, vals_in });
}
thread * threads = new thread[2];
for (uint32_t i = 0; i < 10; i++)
{
checkCudaErrors(cudaSetDevice(0));
for (size_t threadId = 0; threadId < 2; threadId++)
{
// threadId captured by value; ps and i by reference (safe here: ps
// outlives the threads and i is stable until after the joins).
auto f = [&, threadId]() {
ps[threadId].Sort(i);
};
threads[threadId] = thread(f);
}
for (size_t threadId = 0; threadId < 2; threadId++)
{
threads[threadId].join();
}
// Copy back the first two sorted pairs from each device for spot-checking.
for (size_t threadId = 0; threadId < 2; threadId++)
{
checkCudaErrors(cudaSetDevice(ps[threadId].m_deviceId));
checkCudaErrors(cudaMemcpy(keys_out + 2 * threadId, ps[threadId].d_keys_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(vals_out + 2 * threadId, ps[threadId].d_vals_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
}
checkCudaErrors(cudaSetDevice(0));
printf("[%d] %d %d %d %d %d %d %d %d\n", i, keys_out[0], keys_out[1], vals_out[0], vals_out[1], keys_out[2], keys_out[3], vals_out[2], vals_out[3]);
}
for (int i = 0; i < 2; i++)
{
ps[i].Release();
}
if (keys_in) delete[] keys_in;
if (vals_in) delete[] vals_in;
if (keys_out) delete[] keys_out;
if (vals_out) delete[] vals_out;
if (threads) delete[] threads;
checkCudaErrors(cudaDeviceReset());
return 0;
}
$ nvcc -arch=sm_35 -std=c++11 -o t1348 t1348.cu -I/usr/local/cuda/samples/common/inc
$ ./t1348
[0] 0 0 299 463 0 0 315 333
[1] 0 0 315 333 0 0 117 437
[2] 0 0 117 437 0 0 119 683
[3] 0 0 119 683 0 0 326 498
[4] 0 0 326 498 0 0 390 966
[5] 0 0 390 966 0 0 321 340
[6] 0 0 321 340 0 0 77 271
[7] 0 0 77 271 0 0 553 599
[8] 0 0 553 599 0 0 73 929
[9] 0 0 73 929 0 0 614 1164
$ cuda-memcheck ./t1348
========= CUDA-MEMCHECK
[0] 0 0 664 868 0 0 272 672
[1] 0 0 272 672 0 0 458 491
[2] 0 0 458 491 0 0 555 618
[3] 0 0 555 618 0 0 268 459
[4] 0 0 268 459 0 0 349 546
[5] 0 0 349 546 0 0 13 366
[6] 0 0 13 366 0 0 96 185
[7] 0 0 96 185 0 0 543 1004
[8] 0 0 543 1004 0 0 54 138
[9] 0 0 54 138 0 0 7 500
========= ERROR SUMMARY: 0 errors
$