Cuda launch kernel failed when there is memory dependency

I am trying to implement a toy example that includes two Cuda kernels. One is the worker, the other is the signal sender. The demo code is listed as the following. It seems that the prempt kernel cannot execute at all. Is there any underlying lock mechanism to avoid such concurrent access? Thank you!

#include <stdio.h>
#include <assert.h>
#include <unistd.h>

// CUDA runtime
#include <cuda_runtime.h>

using clock_value_t = long long;

__global__ void worker(int* is_stop_dev) {
	printf("Worker enter\n");
	while (*is_stop_dev != 1) {
		clock_value_t sleep_cycles = 10000000000;
		clock_value_t start = clock64();
        clock_value_t cycles_elapsed;
		// printf("Sleep...\n");
        do { cycles_elapsed = clock64() - start; } 
        while (cycles_elapsed < sleep_cycles);
	printf("Exit success!\n");

__global__ void prempt(int* is_stop_dev) {
	printf("Prempt enter\n");
	*is_stop_dev = 1;
	// atomicAdd(is_stop, 1);
	printf("Set the stop flag\n");

int main(int argc, char **argv) {
	int is_stop_host = 0;
	int* is_stop_dev;
	cudaMalloc(reinterpret_cast<void **>(&is_stop_dev), sizeof(int));

	cudaMemcpy(is_stop_dev, &is_stop_host, sizeof(int), cudaMemcpyHostToDevice);
	worker<<<1, 1>>>(is_stop_dev);
	printf("Launch the prempt kernel...\n");
	cudaStream_t pstream;
	prempt<<<1, 1, 0, pstream>>>(is_stop_dev);

	// cudaFree(is_stop_dev);

By default, streams cannot run concurrently with the default stream. You can either use a second non-default stream for the worker kernel, or create the pstream with flag cudaStreamNonBlocking

Thank you so much!