callback functions and thread synchronization

RedCrow · May 13, 2016, 12:15pm

I sort of modified the simpleCallback.cu program, that you can find in the Cuda samples, to be used with c++11 std::thread, but I’m not able to get the correct synchronization. Here’s the source code:

#include <iostream>
#include <utility>
#include <thread>
#include <chrono>
#include <functional>
#include <atomic>
#include <condition_variable>
#include <vector>
#include <iterator>

#include "helper_cuda.h"

#define N 100

class Barrier {

public:
    Barrier(int max): count(0), maxCount(max) {}

    int count;
    int maxCount;
    std::mutex m;
    std::condition_variable icv;

};

class Workload {

public:
    int id;
    std::thread::id thid;
    int cudaDeviceID;

    int *h_data;
    int *d_data;
    cudaStream_t stream;

    Barrier* wall;

    bool success;
    void initData() {
        h_data = (int *)malloc(N*sizeof(int));
        for (int i=0; i<N; ++i)
            h_data[i] = i;
    }

};

__global__ void simpleKernel(int* data, int id) {

    /*
     * Simple kernel: multiplication of a vector by a scalar constant (worker id)
     */

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    while (idx < N) {
       data[idx]*=id;
       idx += blockDim.x*gridDim.x;
    }

}

void* dataPostProcess(void* data) {

    Workload* wrk = (Workload*)data; // Cast void* back to Workload*
    checkCudaErrors(cudaSetDevice(wrk->cudaDeviceID)); // Keep the same device associated to the Workload

    // Do something with data...

    // Free Resources
    checkCudaErrors(cudaFree(wrk->d_data));
    checkCudaErrors(cudaFreeHost(wrk->h_data));
    checkCudaErrors(cudaStreamDestroy(wrk->stream));

    std::lock_guard<std::mutex> lock(wrk->wall->m);
    wrk->wall->count++;
    if (wrk->wall->count >= wrk->wall->maxCount) {
        wrk->wall->icv.notify_one();
    }

    return 0;

}

void callback(cudaStream_t stream, cudaError_t status, void* data) {

    checkCudaErrors(status);
    auto t = std::thread(dataPostProcess, data); // GPU controlled CPU thread
    t.join();

}

void* launch(Workload* wrk) {

    std::thread::id this_id = std::this_thread::get_id();
    wrk->thid = this_id;

    dim3 grid(16, 1, 1);
    dim3 block(16, 1, 1);
    checkCudaErrors(cudaStreamCreate(&wrk->stream)); // Stream creation
    checkCudaErrors(cudaHostAlloc(&wrk->h_data, N*sizeof(int), cudaHostAllocPortable)); // Allocation of pinned memory @host 
    checkCudaErrors(cudaMalloc(&wrk->d_data, N*sizeof(int))); // Allocation of device memory

    wrk->initData(); // Data initialization on host
    checkCudaErrors(cudaMemcpyAsync(wrk->d_data, wrk->h_data, N * sizeof(int), cudaMemcpyHostToDevice, wrk->stream));

    simpleKernel<<<grid, block, 0, wrk->stream>>>(wrk->d_data, wrk->id); // Kernel call

    checkCudaErrors(cudaMemcpyAsync(wrk->h_data, wrk->d_data, N * sizeof(int), cudaMemcpyDeviceToHost, wrk->stream));

    // cast of Workload to void* type, as it's the only one that cudaStreamAddCallback is able to digest
    void* arg = (void*)wrk;

    checkCudaErrors(cudaStreamAddCallback(wrk->stream, callback, arg, 0)); // callback call

    return 0;
}

int main(int argc, char* argv[]) {

    int NWorkers = 5; // Number of workers
    int devId = 0;    // Device Id
    int devNum = 2;   // Max number of Gpu on my system
    int gpuInfo[2];
    int maxDev = 0;

    for (int devId=0; devId<devNum; ++devId) {

        cudaDeviceProp dprop;
        checkCudaErrors(cudaGetDeviceProperties(&dprop, devId));

        int SMversion = dprop.major << 4 + dprop.minor;
        printf("GPU[%d] %s supports SM %d.%d", devId, dprop.name, dprop.major, dprop.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[maxDev++] = devId;
        }

    }

    checkCudaErrors(cudaSetDevice(devId));

    Barrier wall(NWorkers);      // Barrier instance
    std::vector<Workload*> wvec; // Simple Workloads (taken from CUDA simpleCallback example)

    for (int i=0; i<NWorkers; ++i) {
        Workload* w = new Workload();
        w->wall = &wall;
        w->id = i;
        w->cudaDeviceID = devId;
        wvec.push_back(w);
    }

    std::vector<std::thread> tlist; // Vector of std::thread's

    for (int i = 0; i<NWorkers; i++) {
        tlist.push_back(std::thread(launch, wvec[i]));
    }

    for (int i = 0; i<NWorkers; i++) {
        tlist[i].join();
    }

    std::unique_lock<std::mutex> lock(wall.m);

    while (wall.count < NWorkers) {
        wall.icv.wait(lock);
    }

    lock.unlock();

    checkCudaErrors(cudaDeviceReset()); // reset the device

    return 0;

}

compiled with:
nvcc -ccbin g++ --std=c++11 -I/usr/local/cuda/samples/common/inc -lpthread -o GpuCpuCallback GpuCpuCallback.cu

g++ version is 4.9, checkCudaErrors is picked from the Cuda SDK installation

If I avoid using the Barrier, that is commenting out lines 75 to 79 and 165 to 171, the program hangs indefinitely trying to join the threads (line 162). If I try to use the Barrier, keeping the join of the thread spawned by the callback function (line 89), the program hangs again. If I finally try to keep the Barrier but avoid the join at line 89, the program ends with:

terminate called without an active exception

that is the usual abortion message when a thread is not properly synchronized.
The original simpleCallback is just using the Barrier to notify the main thread of the completion of the other ones, so no explicit join()s are called. Am I doing something wrong or “illegal” using std::thread’s?
Thanks in advance for any help or suggestions.

Robert_Crovella · May 14, 2016, 1:44am

On my machine (Fedora 20, gcc 4.8.3, CUDA 7.5) if I comment out the following 2 lines in dataPostProcess:

void* dataPostProcess(void* data) {

    Workload* wrk = (Workload*)data; // Cast void* back to Workload*
    checkCudaErrors(cudaSetDevice(wrk->cudaDeviceID)); // Keep the same device associated to the Workload

    // Do something with data...

    // Free Resources
    // checkCudaErrors(cudaFree(wrk->d_data));       THIS LINE COMMENTED OUT
    // checkCudaErrors(cudaFreeHost(wrk->h_data));   THIS LINE COMMENTED OUT
    checkCudaErrors(cudaStreamDestroy(wrk->stream));

    std::lock_guard<std::mutex> lock(wrk->wall->m);
    wrk->wall->count++;
    if (wrk->wall->count >= wrk->wall->maxCount) {
        wrk->wall->icv.notify_one();
    }

    return 0;

}

then your code as posted runs to completion for me, apparently without error.

It appears that the call to cudaFree is actually hanging. If you comment out that line, the following line call to cudaFreeHost will throw an error as indicated by the cuda error checking macro.

I don’t have an explanation for any of this at this time, just sharing an observation.

Topic		Replies	Views
Synchronization hangs sporadically after kernel launch CUDA Programming and Performance	23	7840	August 20, 2015
Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism CUDA Programming and Performance	6	693	May 3, 2019
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8609	June 16, 2011
Computation and PCIe tranfers overlaping with callbacks and events. CUDA Programming and Performance	7	915	July 7, 2016
unable to get the cpu and gpu to run in parallel CUDA Programming and Performance	34	23225	October 7, 2010
Using unified memory causes system crash CUDA Programming and Performance	28	5912	February 4, 2019
Overlapping CPU and GPU operations using streams. Total failure. Any help? CUDA Programming and Performance	6	6044	April 2, 2013
QUIT CUDA? Kernel and pinned memory gives strange results CUDA Programming and Performance	6	6721	September 22, 2011
Why does cudaStreamAddCallback serialize kernel execution and break concurrency? CUDA Programming and Performance	12	8093	April 5, 2015
Parallel Reduction CUDA Programming and Performance	10	3736	June 26, 2011

callback functions and thread synchronization

Related topics