Stop kernel function as soon as condition is met (CUDA)

Given an array of bools the kernel function should check if at least for one index the flag is set to false. Then if this occurs it sets not_terminated to true and exits immediately.

t1.cu

#include <algorithm>
#include <iostream>
#include <vector>

template <size_t n>
__global__ void lock_check(const int num_rows, const bool* dependant_locks,
                           int* not_terminated)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    volatile __shared__ int warp_found;
    if (threadIdx.x == 0)
        warp_found = *not_terminated;
    __syncthreads();

    size_t i = 0;
    while (!warp_found && i < n)
    {
        if (index + i >= num_rows)
            return;

        if (dependant_locks[index + i] == false)
        {
            // atomicExch(&warp_found, 1);
            warp_found = true;
            atomicCAS(not_terminated, 0, true);
            // *not_terminated = true;
        }

        if (threadIdx.x == 0 && *not_terminated)
            warp_found = true;

        i++;
        __syncthreads();
    }
}

int main(int argc, const char* argv[])
{
    std::vector<bool> check(51813503);
    std::fill(check.begin(), check.end(), true);
    check[27138944] = false;
    check[27139152] = false;
    check[27139200] = false;
    check[27139206] = false;
    check[27139207] = false;
    check[27139208] = false;
    check[27139209] = false;
    check[27139210] = false;
    check[27139211] = false;
    check[27139212] = false;

    bool* dev_check;
    int* dev_not_terminated;
    cudaMalloc(&dev_check, 51813503 * sizeof(bool));
    cudaMemcpy(dev_check, check.data(), 51813503 * sizeof(bool),
               cudaMemcpyHostToDevice);

    int not_terminated = false;
    cudaMalloc(&dev_not_terminated, sizeof(int));
    cudaMemcpy(dev_not_terminated, &not_terminated, sizeof(int),
               cudaMemcpyHostToDevice);

    dim3 threads_per_block(128, 1, 1);
    constexpr size_t n = 2;
    int blocks = ceil(51813503 / (n * 128)) + 1;
    dim3 blocks_per_grid(blocks, 1, 1);

    lock_check<n><<<blocks_per_grid, threads_per_block>>>(51813503, dev_check,
                                                          dev_not_terminated);

    cudaMemcpy(&not_terminated, dev_not_terminated, sizeof(int),
               cudaMemcpyDeviceToHost);
    if (not_terminated)
        printf("detected\n");
    else
        printf("not detected\n");

    return 0;
}

I tested for race conditions with

$ compute-sanitizer/compute-sanitizer --launch-timeout 0 --racecheck-detect-level info --racecheck-report all ./a.out
========= COMPUTE-SANITIZER
not detected
========= ERROR SUMMARY: 0 errors

but nothing was detected. The kernel function above does not work (it is very similar to how can a global function RETURN a value or BREAK out like C/C++ does).

So what am I missing here? Am I doing something wrong with shared memory? Moreover if you have a more efficient approach to solve this I would gladly follow it (I am still a beginner).

If you are looking for a simple solution to find out if at least one flag is set to false, I would suggest using thrust::any_of

#include <thrust/functional.h>
#include <thrust/logical.h>

bool atLeastOnIsFalse = thrust::any_of(
    thrust::device,
    dev_check,
    dev_check + 51813503,
    thrust::logical_not<bool>{}
);

1 Like

Thank you for your answer. I just tested it and it works.
However I would really much prefer not to use external libraries, also I need to reuse the code above for other purposes.

There are two mistakes in your code.
First, this code has undefined behaviour.

if (index + i >= num_rows)
            return;
....
__syncthreads();

You need to make sure that all threads execute syncthreads()

Second, it seems that each thread should process n array positions. Then, the array access index should be n * index + i, assuming each thread processes consecutive entries , not index + i . With index+i you never access the false elements.

This modified kernel prints “detected\n”;

template <size_t n>
__global__ void lock_check(const int num_rows, const bool* dependant_locks,
                           int* not_terminated)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    volatile __shared__ int warp_found;
    if (threadIdx.x == 0)
        warp_found = *not_terminated;
    __syncthreads();

    size_t i = 0;
    while (!warp_found && i < n)
    {
        if (n*index + i < num_rows){


                if (dependant_locks[n*index + i] == false)
                {
                    // atomicExch(&warp_found, 1);
                    warp_found = true;
                    atomicCAS(not_terminated, 0, true);
                    // *not_terminated = true;
                }

                if (threadIdx.x == 0 && *not_terminated)
                    warp_found = true;
        }
        i++;
        __syncthreads();
    }
}

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.