callback functions and thread synchronization

I sort of modified the simpleCallback.cu program, that you can find in the Cuda samples, to be used with c++11 std::thread, but I’m not able to get the correct synchronization. Here’s the source code:

#include <iostream>
#include <utility>
#include <thread>
#include <chrono>
#include <functional>
#include <atomic>
#include <condition_variable>
#include <vector>
#include <iterator>

#include "helper_cuda.h"

#define N 100

class Barrier {

public:
    Barrier(int max): count(0), maxCount(max) {}

    int count;
    int maxCount;
    std::mutex m;
    std::condition_variable icv;

};

class Workload {

public:
    int id;
    std::thread::id thid;
    int cudaDeviceID;

    int *h_data;
    int *d_data;
    cudaStream_t stream;

    Barrier* wall;

    bool success;
    void initData() {
        h_data = (int *)malloc(N*sizeof(int));
        for (int i=0; i<N; ++i)
            h_data[i] = i;
    }

};

__global__ void simpleKernel(int* data, int id) {

    /*
     * Simple kernel: multiplication of a vector by a scalar constant (worker id)
     */

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    while (idx < N) {
       data[idx]*=id;
       idx += blockDim.x*gridDim.x;
    }

}

void* dataPostProcess(void* data) {

    Workload* wrk = (Workload*)data; // Cast void* back to Workload*
    checkCudaErrors(cudaSetDevice(wrk->cudaDeviceID)); // Keep the same device associated to the Workload

    // Do something with data...

    // Free Resources
    checkCudaErrors(cudaFree(wrk->d_data));
    checkCudaErrors(cudaFreeHost(wrk->h_data));
    checkCudaErrors(cudaStreamDestroy(wrk->stream));

    std::lock_guard<std::mutex> lock(wrk->wall->m);
    wrk->wall->count++;
    if (wrk->wall->count >= wrk->wall->maxCount) {
        wrk->wall->icv.notify_one();
    }

    return 0;

}

void callback(cudaStream_t stream, cudaError_t status, void* data) {

    checkCudaErrors(status);
    auto t = std::thread(dataPostProcess, data); // GPU controlled CPU thread
    t.join();

}

void* launch(Workload* wrk) {

    std::thread::id this_id = std::this_thread::get_id();
    wrk->thid = this_id;

    dim3 grid(16, 1, 1);
    dim3 block(16, 1, 1);
    checkCudaErrors(cudaStreamCreate(&wrk->stream)); // Stream creation
    checkCudaErrors(cudaHostAlloc(&wrk->h_data, N*sizeof(int), cudaHostAllocPortable)); // Allocation of pinned memory @host 
    checkCudaErrors(cudaMalloc(&wrk->d_data, N*sizeof(int))); // Allocation of device memory

    wrk->initData(); // Data initialization on host
    checkCudaErrors(cudaMemcpyAsync(wrk->d_data, wrk->h_data, N * sizeof(int), cudaMemcpyHostToDevice, wrk->stream));

    simpleKernel<<<grid, block, 0, wrk->stream>>>(wrk->d_data, wrk->id); // Kernel call

    checkCudaErrors(cudaMemcpyAsync(wrk->h_data, wrk->d_data, N * sizeof(int), cudaMemcpyDeviceToHost, wrk->stream));

    // cast of Workload to void* type, as it's the only one that cudaStreamAddCallback is able to digest
    void* arg = (void*)wrk;

    checkCudaErrors(cudaStreamAddCallback(wrk->stream, callback, arg, 0)); // callback call

    return 0;
}

int main(int argc, char* argv[]) {

    int NWorkers = 5; // Number of workers
    int devId = 0;    // Device Id
    int devNum = 2;   // Max number of Gpu on my system
    int gpuInfo[2];
    int maxDev = 0;

    for (int devId=0; devId<devNum; ++devId) {

        cudaDeviceProp dprop;
        checkCudaErrors(cudaGetDeviceProperties(&dprop, devId));

        int SMversion = dprop.major << 4 + dprop.minor;
        printf("GPU[%d] %s supports SM %d.%d", devId, dprop.name, dprop.major, dprop.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[maxDev++] = devId;
        }

    }

    checkCudaErrors(cudaSetDevice(devId));

    Barrier wall(NWorkers);      // Barrier instance
    std::vector<Workload*> wvec; // Simple Workloads (taken from CUDA simpleCallback example)

    for (int i=0; i<NWorkers; ++i) {
        Workload* w = new Workload();
        w->wall = &wall;
        w->id = i;
        w->cudaDeviceID = devId;
        wvec.push_back(w);
    }

    std::vector<std::thread> tlist; // Vector of std::thread's

    for (int i = 0; i<NWorkers; i++) {
        tlist.push_back(std::thread(launch, wvec[i]));
    }

    for (int i = 0; i<NWorkers; i++) {
        tlist[i].join();
    }

    std::unique_lock<std::mutex> lock(wall.m);

    while (wall.count < NWorkers) {
        wall.icv.wait(lock);
    }

    lock.unlock();

    checkCudaErrors(cudaDeviceReset()); // reset the device

    return 0;

}

compiled with:
nvcc -ccbin g++ --std=c++11 -I/usr/local/cuda/samples/common/inc -lpthread -o GpuCpuCallback GpuCpuCallback.cu

g++ version is 4.9, checkCudaErrors is picked from the Cuda SDK installation

If I avoid using the Barrier, that is commenting out lines 75 to 79 and 165 to 171, the program hangs indefinitely trying to join the threads (line 162). If I try to use the Barrier, keeping the join of the thread spawned by the callback function (line 89), the program hangs again. If I finally try to keep the Barrier but avoid the join at line 89, the program ends with:

terminate called without an active exception

that is the usual abortion message when a thread is not properly synchronized.
The original simpleCallback is just using the Barrier to notify the main thread of the completion of the other ones, so no explicit join()s are called. Am I doing something wrong or “illegal” using std::thread’s?
Thanks in advance for any help or suggestions.

On my machine (Fedora 20, gcc 4.8.3, CUDA 7.5) if I comment out the following 2 lines in dataPostProcess:

void* dataPostProcess(void* data) {

    Workload* wrk = (Workload*)data; // Cast void* back to Workload*
    checkCudaErrors(cudaSetDevice(wrk->cudaDeviceID)); // Keep the same device associated to the Workload

    // Do something with data...

    // Free Resources
    // checkCudaErrors(cudaFree(wrk->d_data));       THIS LINE COMMENTED OUT
    // checkCudaErrors(cudaFreeHost(wrk->h_data));   THIS LINE COMMENTED OUT
    checkCudaErrors(cudaStreamDestroy(wrk->stream));

    std::lock_guard<std::mutex> lock(wrk->wall->m);
    wrk->wall->count++;
    if (wrk->wall->count >= wrk->wall->maxCount) {
        wrk->wall->icv.notify_one();
    }

    return 0;

}

then your code as posted runs to completion for me, apparently without error.

It appears that the call to cudaFree is actually hanging. If you comment out that line, the following line call to cudaFreeHost will throw an error as indicated by the cuda error checking macro.

I don’t have an explanation for any of this at this time, just sharing an observation.