Immediate termination of all threads after the condition is met

What is the correct way to terminate all threads in a block after a condition is met?

The kernel program below still runs for the same amount of time regardless of changing test conditions:

if (threadIdx.x == 512 && a == 1000000ull)
if (threadIdx.x == 0 && a == 1000ull)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>


// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
# define blocks 1
# define threads 1024

cudaError_t cudaStatus;

// Spin kernel: every thread counts until a designated thread raises a
// block-wide stop flag. Launched as <<<1, 1024>>> (single block), so a
// __shared__ flag is visible to all participating threads.
__global__ void blackcat(void) {

	uint64_t n = 1000000000ull;	// 1E9 upper bound on iterations

	uint64_t a = 0;

	// volatile forces every textual access to flag to touch the actual
	// shared-memory location. Without it the compiler may cache flag in
	// a register, so other threads never observe the update and the
	// early exit never fires.
	__shared__ volatile bool flag;

	flag = false;		// all threads write the same value: benign race
	__syncthreads();	// no thread may read flag before it is initialized

	while (n --> 0) {
		a++;
		if (flag) {	// poll the shared flag; stop once any thread sets it
			break;
		}
		if (threadIdx.x == 512 && a == 1000000ull) {	// 1E6, just 1 block
			// uint64_t requires an unsigned format; %lld is a
			// signed/unsigned mismatch (undefined behavior).
			printf("%llu\n", (unsigned long long)a);
			flag = true;	// signal the rest of the block to break
		}
	}
}

// Host driver: times one launch of blackcat with CUDA events and reports
// any launch or execution error. Returns 0 unconditionally (unchanged
// interface); errors are reported on stderr.
int main() {

	cudaEvent_t start, stop;	// CUDA events used to time the kernel
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Check the return value of cudaSetDevice directly rather than
	// polling cudaGetLastError afterwards.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
	}

	cudaEventRecord(start, 0);
	blackcat << <blocks, threads >> > ();
	// Launch-configuration errors surface via cudaGetLastError, not the
	// launch statement itself; check immediately after the launch.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
	}
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);	// block until the kernel and stop event finish

	cudaEventElapsedTime(&time, start, stop);	// elapsed time in milliseconds
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	printf("CUDA time is %f s\n", time / 1000);

	// Asynchronous execution errors inside the kernel surface at the
	// next synchronizing call; check its return value directly.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		// Error message fixed: the kernel launched here is blackcat,
		// not addKernel (stale text from the CUDA sample template).
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching blackcat!\n", cudaStatus);
	}

	return 0;
}

The compiler can optimize its usage of a variable if you don’t inform it otherwise. The flag variable here is getting “optimized into a register”, meaning that when one thread updates the shared location, other threads don’t see it. You need to use a memory barrier of some sort, or another approach is to use the volatile qualifier. This is a general principle to be aware of any time you are communicating between threads.

Example:

$ cat t2214.cu
#include <stdio.h>
#include <iostream>
#include <time.h>


// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
# define blocks 1
# define threads 1024

cudaError_t cudaStatus;

// Demonstration kernel: shows that a plain __shared__ flag may be
// "optimized into a register" so other threads never see it updated,
// while a volatile __shared__ flag makes the early exit work.
// Launched as <<<1, 1024>>>, so the flag is block-wide.
__global__ void blackcat(void) {

        uint64_t n = 1000000000ull;     // 1E9, 29.493572 s, test b = 2048

        uint64_t a = 0;



// USE_VOLATILE selects the fixed variant: volatile forces every access
// to flag to go to the shared-memory location instead of a register,
// so all threads observe the update.
#ifdef USE_VOLATILE
        __shared__ volatile bool flag;
#else
        __shared__ bool flag;
#endif

        // Every thread writes the same initial value (benign race).
        flag = false;

        while (n --> 0) {
                a++;
                // Poll the block-wide flag; the break is only reached in
                // practice when the volatile variant is compiled.
                if (flag) {
                        break;
                }
// USE_SHORT selects a condition met 1000x sooner, to test whether the
// early-exit path actually shortens the measured run time.
#ifdef USE_SHORT
                if (threadIdx.x == 0 && a == 1000ull){
#else
                if (threadIdx.x == 512 && a == 1000000ull) {    // 1E6, just 1 block
#endif
                        printf("%lld\n", a);
                        // Raise the stop flag for the rest of the block.
                        flag = true;
                }
        }
}

// Host driver: times a single <<<blocks, threads>>> launch of blackcat
// with CUDA events, prints the elapsed time in seconds, and reports any
// launch or synchronization error on stderr.
int main() {

        cudaEvent_t start, stop;        // CUDA time
        float time;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaSetDevice(0);
        // NOTE(review): this reads the sticky error state rather than the
        // return value of cudaSetDevice itself.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        }

        cudaEventRecord(start, 0);
        blackcat << <blocks, threads >> > ();
        cudaEventRecord(stop, 0);
        // Blocks until the stop event (and therefore the kernel) completes.
        cudaEventSynchronize(stop);

        // cudaEventElapsedTime reports milliseconds; printed as seconds.
        cudaEventElapsedTime(&time, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        printf("CUDA time is %f s\n", time/1000);

        // Launch-configuration errors surface via cudaGetLastError, not
        // the launch statement itself.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        }
        // Asynchronous execution errors surface at the next synchronizing
        // call; checked here via the sticky error state.
        cudaDeviceSynchronize();
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        }

        return 0;
}
$ nvcc -o t2214 t2214.cu
$ time ./t2214
1000000
CUDA time is 8.491987 s

real    0m8.928s
user    0m5.706s
sys     0m3.203s
$ nvcc -o t2214 t2214.cu -DUSE_SHORT
$ time ./t2214
1000
CUDA time is 9.748708 s

real    0m10.186s
user    0m6.395s
sys     0m3.779s
$ nvcc -o t2214 t2214.cu -DUSE_VOLATILE
$ time ./t2214
1000000
CUDA time is 0.041508 s

real    0m0.447s
user    0m0.066s
sys     0m0.379s
$ nvcc -o t2214 t2214.cu -DUSE_VOLATILE -DUSE_SHORT
$ time ./t2214
1000
CUDA time is 0.004152 s

real    0m0.464s
user    0m0.046s
sys     0m0.397s
$

We see that in the non-volatile case, the “short” path is not shorter. In the volatile case, the short path is 10x shorter than the long path.

You can read about memory barriers and the volatile qualifier in the programming guide.

Thank you. I have looked at memory barriers and volatile, but without an example it’s beyond my understanding how to program it.

Robert_Crovella already provided an example of how to use volatile here:

The beauty of CUDA is that code is written from the viewpoint of a single thread, allowing the actual degree of parallelism to be configured independently of the code at run time.

The semantics of the volatile modifier in C++ are roughly “this data object is modified by an agent outside the current thread of execution”. Which is exactly what happens here: For the vast majority of threads, the thread writing flag is different from the thread reading flag.

As a consequence of the use of volatile, the compiler needs to generate code such that every textual access to such an object results in an actual access to the physical object, in this case a shared-memory location. The data cannot be pulled into a register for faster access.

Note that volatile is generally not a replacement for synchronization primitives but may be used in conjunction with them.

Perfect, it seems it’s doing what I needed. Thank you.

CUDA time is 0.135015 s //if (threadIdx.x == 512 && a == 1000000ull)
CUDA time is 124.212807 s //if (threadIdx.x == 2048 && a == 1000000ull)

For:

# define blocks 1
# define threads 1024