I sort of modified the simpleCallback.cu program, that you can find in the Cuda samples, to be used with c++11 std::thread, but I’m not able to get the correct synchronization. Here’s the source code:
#include <iostream>
#include <utility>
#include <thread>
#include <chrono>
#include <functional>
#include <atomic>
#include <condition_variable>
#include <vector>
#include <iterator>
#include "helper_cuda.h"
#define N 100
class Barrier {
public:
Barrier(int max): count(0), maxCount(max) {}
int count;
int maxCount;
std::mutex m;
std::condition_variable icv;
};
class Workload {
public:
int id;
std::thread::id thid;
int cudaDeviceID;
int *h_data;
int *d_data;
cudaStream_t stream;
Barrier* wall;
bool success;
void initData() {
h_data = (int *)malloc(N*sizeof(int));
for (int i=0; i<N; ++i)
h_data[i] = i;
}
};
__global__ void simpleKernel(int* data, int id) {
/*
* Simple kernel: multiplication of a vector by a scalar constant (worker id)
*/
int idx = threadIdx.x + blockDim.x * blockIdx.x;
while (idx < N) {
data[idx]*=id;
idx += blockDim.x*gridDim.x;
}
}
void* dataPostProcess(void* data) {
Workload* wrk = (Workload*)data; // Cast void* back to Workload*
checkCudaErrors(cudaSetDevice(wrk->cudaDeviceID)); // Keep the same device associated to the Workload
// Do something with data...
// Free Resources
checkCudaErrors(cudaFree(wrk->d_data));
checkCudaErrors(cudaFreeHost(wrk->h_data));
checkCudaErrors(cudaStreamDestroy(wrk->stream));
std::lock_guard<std::mutex> lock(wrk->wall->m);
wrk->wall->count++;
if (wrk->wall->count >= wrk->wall->maxCount) {
wrk->wall->icv.notify_one();
}
return 0;
}
void callback(cudaStream_t stream, cudaError_t status, void* data) {
checkCudaErrors(status);
auto t = std::thread(dataPostProcess, data); // GPU controlled CPU thread
t.join();
}
void* launch(Workload* wrk) {
std::thread::id this_id = std::this_thread::get_id();
wrk->thid = this_id;
dim3 grid(16, 1, 1);
dim3 block(16, 1, 1);
checkCudaErrors(cudaStreamCreate(&wrk->stream)); // Stream creation
checkCudaErrors(cudaHostAlloc(&wrk->h_data, N*sizeof(int), cudaHostAllocPortable)); // Allocation of pinned memory @host
checkCudaErrors(cudaMalloc(&wrk->d_data, N*sizeof(int))); // Allocation of device memory
wrk->initData(); // Data initialization on host
checkCudaErrors(cudaMemcpyAsync(wrk->d_data, wrk->h_data, N * sizeof(int), cudaMemcpyHostToDevice, wrk->stream));
simpleKernel<<<grid, block, 0, wrk->stream>>>(wrk->d_data, wrk->id); // Kernel call
checkCudaErrors(cudaMemcpyAsync(wrk->h_data, wrk->d_data, N * sizeof(int), cudaMemcpyDeviceToHost, wrk->stream));
// cast of Workload to void* type, as it's the only one that cudaStreamAddCallback is able to digest
void* arg = (void*)wrk;
checkCudaErrors(cudaStreamAddCallback(wrk->stream, callback, arg, 0)); // callback call
return 0;
}
int main(int argc, char* argv[]) {
int NWorkers = 5; // Number of workers
int devId = 0; // Device Id
int devNum = 2; // Max number of Gpu on my system
int gpuInfo[2];
int maxDev = 0;
for (int devId=0; devId<devNum; ++devId) {
cudaDeviceProp dprop;
checkCudaErrors(cudaGetDeviceProperties(&dprop, devId));
int SMversion = dprop.major << 4 + dprop.minor;
printf("GPU[%d] %s supports SM %d.%d", devId, dprop.name, dprop.major, dprop.minor);
printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");
if (SMversion >= 0x11) {
gpuInfo[maxDev++] = devId;
}
}
checkCudaErrors(cudaSetDevice(devId));
Barrier wall(NWorkers); // Barrier instance
std::vector<Workload*> wvec; // Simple Workloads (taken from CUDA simpleCallback example)
for (int i=0; i<NWorkers; ++i) {
Workload* w = new Workload();
w->wall = &wall;
w->id = i;
w->cudaDeviceID = devId;
wvec.push_back(w);
}
std::vector<std::thread> tlist; // Vector of std::thread's
for (int i = 0; i<NWorkers; i++) {
tlist.push_back(std::thread(launch, wvec[i]));
}
for (int i = 0; i<NWorkers; i++) {
tlist[i].join();
}
std::unique_lock<std::mutex> lock(wall.m);
while (wall.count < NWorkers) {
wall.icv.wait(lock);
}
lock.unlock();
checkCudaErrors(cudaDeviceReset()); // reset the device
return 0;
}
compiled with:
nvcc -ccbin g++ --std=c++11 -I/usr/local/cuda/samples/common/inc -lpthread -o GpuCpuCallback GpuCpuCallback.cu
g++ version is 4.9, checkCudaErrors is picked from the Cuda SDK installation
If I avoid using the Barrier, that is commenting out lines 75 to 79 and 165 to 171, the program hangs indefinitely trying to join the threads (line 162). If I try to use the Barrier, keeping the join of the thread spawned by the callback function (line 89), the program hangs again. If I finally try to keep the Barrier but avoid the join at line 89, the program ends with:
terminate called without an active exception
that is the usual abortion message when a thread is not properly synchronized.
The original simpleCallback is just using the Barrier to notify the main thread of the completion of the other ones, so no explicit join()s are called. Am I doing something wrong or “illegal” using std::thread’s?
Thanks in advance for any help or suggestions.