The compiler is free to optimize its usage of a variable unless you inform it otherwise. The flag variable here is being "optimized into a register", meaning that when one thread updates the shared location, the other threads never see the change. You need either a memory barrier of some sort or, alternatively, the volatile qualifier. This is a general principle to be aware of any time you are communicating between threads.
Example:
$ cat t2214.cu
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#include <time.h>
// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
#define blocks 1
#define threads 1024
cudaError_t cudaStatus;

__global__ void blackcat(void) {
    uint64_t n = 1000000000ull; // 1E9, 29.493572 s, test b = 2048
    uint64_t a = 0;
#ifdef USE_VOLATILE
    __shared__ volatile bool flag;
#else
    __shared__ bool flag;
#endif
    flag = false;
    while (n-- > 0) {
        a++;
        if (flag) {        // without volatile, this read may be cached in a register
            break;
        }
#ifdef USE_SHORT
        if (threadIdx.x == 0 && a == 1000ull) {
#else
        if (threadIdx.x == 512 && a == 1000000ull) { // 1E6, just 1 block
#endif
            printf("%llu\n", a);
            flag = true;   // signal the other threads in the block to stop
        }
    }
}

int main() {
    cudaEvent_t start, stop; // CUDA event timing
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaSetDevice(0);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
    }
    cudaEventRecord(start, 0);
    blackcat<<<blocks, threads>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("CUDA time is %f s\n", time/1000);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    cudaDeviceSynchronize();
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching blackcat!\n", cudaStatus);
    }
    return 0;
}
$ nvcc -o t2214 t2214.cu
$ time ./t2214
1000000
CUDA time is 8.491987 s
real 0m8.928s
user 0m5.706s
sys 0m3.203s
$ nvcc -o t2214 t2214.cu -DUSE_SHORT
$ time ./t2214
1000
CUDA time is 9.748708 s
real 0m10.186s
user 0m6.395s
sys 0m3.779s
$ nvcc -o t2214 t2214.cu -DUSE_VOLATILE
$ time ./t2214
1000000
CUDA time is 0.041508 s
real 0m0.447s
user 0m0.066s
sys 0m0.379s
$ nvcc -o t2214 t2214.cu -DUSE_VOLATILE -DUSE_SHORT
$ time ./t2214
1000
CUDA time is 0.004152 s
real 0m0.464s
user 0m0.046s
sys 0m0.397s
$
We see that in the non-volatile case, the "short" path is not shorter: every thread has cached flag in a register, never observes the update, and runs the loop to completion regardless of when the flag is set. In the volatile case, the short path is roughly 10x shorter than the long path (0.004 s vs. 0.042 s), because each thread re-reads flag from shared memory and exits as soon as it is set.
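For reference, the "memory barrier" route mentioned at the top can also be made to work. Here is an untested sketch (not part of the measured example; the flag is widened to int because device atomics operate on 32-bit words) that forces the flag traffic through atomics instead of volatile:

// Sketch: same stop-flag pattern without volatile, using atomics so the
// compiler cannot cache the flag in a register. Hypothetical variant.
__global__ void blackcat_atomic(void) {
    uint64_t n = 1000000000ull;
    uint64_t a = 0;
    __shared__ int flag;                   // int: device atomics want 32-bit words
    if (threadIdx.x == 0) flag = 0;        // single-writer initialization
    __syncthreads();                       // make the initialization visible
    while (n-- > 0) {
        a++;
        if (atomicOr(&flag, 0)) break;     // atomic read: forced shared-memory access
        if (threadIdx.x == 512 && a == 1000000ull) {
            printf("%llu\n", a);
            atomicExch(&flag, 1);          // atomic write of the stop flag
        }
    }
}

Issuing an atomic on every loop iteration is likely costlier than a plain volatile load, which is one reason volatile is attractive for a simple stop flag like this.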
Robert_Crovella already provided an example of how to use volatile here:
The beauty of CUDA is that code is written from the viewpoint of a single thread, allowing the actual degree of parallelism to be configured independently of the code at run time.
The semantics of the volatile modifier in C++ are roughly "this data object is modified by an agent outside the current thread of execution". That is exactly what happens here: for the vast majority of threads, the thread writing flag is different from the thread reading flag.
As a consequence of the use of volatile, the compiler needs to generate code such that every textual access to such an object results in an actual access to the physical object, in this case a shared-memory location. The data cannot be pulled into a register for faster access.
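To make that concrete, here is a rough illustration (not actual compiler output) of the transformation the optimizer is allowed to perform on the non-volatile polling loop above:

// What the source says:
while (n-- > 0) {
    if (flag) break;        // flag is __shared__ bool, no volatile
    // ...
}

// What the compiler may legally emit, since nothing tells it that
// flag can change behind the current thread's back:
bool r = flag;              // one load, hoisted out of the loop
while (n-- > 0) {
    if (r) break;           // tests a stale register copy forever
    // ...
}

With volatile, the hoisting is forbidden and every iteration performs a genuine shared-memory load.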
Note that volatile is generally not a replacement for synchronization primitives but may be used in conjunction with them.
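As a small illustration of that last point, here is a hedged sketch (the kernel name and the counter are made up for the example) where volatile keeps the polled reads honest, while __syncthreads(), a true synchronization primitive, orders the single-writer initialization before any thread starts polling:

__global__ void poll(void) {
    __shared__ volatile bool stop;
    if (threadIdx.x == 0) stop = false; // single writer initializes the flag
    __syncthreads();                    // sync primitive: init completes before polling
    int i = 0;                          // stand-in for real per-thread work
    while (!stop) {
        if (threadIdx.x == 0 && ++i == 1000)
            stop = true;                // volatile write, observed by the pollers
    }
}

volatile alone could not replace the __syncthreads() here: without it, a thread could enter the polling loop before the flag has been initialized.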