Questions about CUDA stream priority

Hi,

I created a demo to test CUDA stream priority. My goal is to verify that the CUDA stream scheduler always selects work from the higher-priority stream's queue until that queue is empty. Please see the test code below:

#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <thread>
#include <chrono>
#include <iostream>
#include <vector>
#include <unistd.h>
#include <mutex>
#include <condition_variable>
#include <functional>

static std::mutex stage_mutex;
static std::condition_variable stage_cond;
static bool g_begin = false;
static int vector_size = 1024 * 1024 * 2;

class Timer {
public:
    template<typename F, typename... Args>
    Timer(int total_time, int interval_time, F&& callback, Args&&... args) :
        total_time(total_time), interval_time(interval_time),
        callback(std::bind(std::forward<F>(callback), std::forward<Args>(args)...)) {
          kernelCount = 0;
          totalKernelTime = 0;
         }

    void start() {
        std::chrono::milliseconds interval(interval_time);
        std::chrono::milliseconds duration(total_time);
        auto start_time = std::chrono::steady_clock::now();
        auto current_time = start_time;
        while (current_time - start_time <= duration) {
            std::this_thread::sleep_for(std::chrono::milliseconds(interval_time));
            auto start = std::chrono::steady_clock::now();
            callback();
            current_time = std::chrono::steady_clock::now();
            totalKernelTime += std::chrono::duration_cast<std::chrono::milliseconds>(current_time - start).count();
            kernelCount++;
        }
    }

    float getMeanKernelTime() {
        return (totalKernelTime / static_cast<float>(kernelCount));
    }

    int getKernelCount() {
        return kernelCount;
    }

private:
    int total_time; //ms
    int interval_time; //ms
    std::function<void()> callback;
    int kernelCount;
    std::chrono::milliseconds::rep totalKernelTime;
};

void cudaCheck(cudaError_t ret, std::ostream& err = std::cerr) {
  if (ret != cudaSuccess) {
    err << "Cuda failure: " << cudaGetErrorString(ret) << std::endl;
  if (ret != cudaSuccess) {
    printf("Cuda failure: %s", cudaGetErrorString(ret));
    abort();
  }
}

void sumArrays(float* a, float* b, float* res, const int size) {
  for (int i = 0; i < size; i += 4) {
    res[i] = a[i] + b[i];
    res[i + 1] = a[i + 1] + b[i + 1];
    res[i + 2] = a[i + 2] + b[i + 2];
    res[i + 3] = a[i + 3] + b[i + 3];
  }
}
void initialData(float* ip, int size) {
  time_t t;
  srand((unsigned)time(&t));
  for (int i = 0; i < size; i++) {
    ip[i] = (float)(rand() & 0xffff) / 1000.0f;
  }
}

void checkResult(float* hostRef, float* gpuRef, const int N) {
  double epsilon = 1.0E-8;
  for (int i = 0; i < N; i++) {
    if (fabs(hostRef[i] - gpuRef[i]) > epsilon) {
      printf("Results don't match!\n");
      printf("%f(hostRef[%d]) != %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i], i);
      return;
    }
  }
  printf("Check result success!\n");
}

__global__ void sumArraysLowP(const float* a, const float* b, float* c, int32_t n) {
  for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
}

__global__ void sumArraysHighP(const float* a, const float* b, float* c, int32_t n) {
  for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
}

__global__ void sumArraysGPU(const float* a, const float* b, float* c, int32_t n) {
  int32_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < n) {
    for (int i = 0; i < 100000; i++) {
      c[idx] = a[idx] + b[idx];
    }
  }
}

void launchKernelLowP(float *a_d, float *a_h, float *b_d, float *b_h, float *res_d, float *res_from_gpu_h, int nElem, int nByte, cudaStream_t stream) {
  // cudaMemcpyAsync(a_d, a_h, nByte, cudaMemcpyHostToDevice, stream);
  // cudaMemcpyAsync(b_d, b_h, nByte, cudaMemcpyHostToDevice, stream);

  sumArraysLowP<<<1, 1, 0, stream>>>(a_d, b_d, res_d, nElem);
  // cudaMemcpyAsync(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost, stream);
  // cudaCheck(cudaStreamSynchronize(stream));
}

void launchKernelHighP(float *a_d, float *a_h, float *b_d, float *b_h, float *res_d, float *res_from_gpu_h, int nElem, int nByte, cudaStream_t stream) {
  // cudaMemcpyAsync(a_d, a_h, nByte, cudaMemcpyHostToDevice, stream);
  // cudaMemcpyAsync(b_d, b_h, nByte, cudaMemcpyHostToDevice, stream);

  sumArraysHighP<<<1, 1, 0, stream>>>(a_d, b_d, res_d, nElem);
  // cudaMemcpyAsync(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost, stream);
  // cudaCheck(cudaStreamSynchronize(stream));
}

void worker(int threadIdx, cudaStream_t stream, int total_time, int interval) {
  int priority;
  cudaStreamGetPriority(stream, &priority);
  printf("Stream priority of worker: %d\n", priority);
  cudaEvent_t startEvent, stopEvent;
  cudaCheck(cudaEventCreate(&startEvent));
  cudaCheck(cudaEventCreate(&stopEvent));
  int nElem = vector_size;
  // printf("Vector size:%d\n", nElem);

  int nByte = sizeof(float) * nElem;
  float* a_h = (float*)malloc(nByte);
  float* b_h = (float*)malloc(nByte);
  float* res_h = (float*)malloc(nByte);
  float* res_from_gpu_h = (float*)malloc(nByte);
  memset(res_h, 0, nByte);
  memset(res_from_gpu_h, 0, nByte);

  float *a_d, *b_d, *res_d;
  cudaMalloc((float**)&a_d, nByte);
  cudaMalloc((float**)&b_d, nByte);
  cudaMalloc((float**)&res_d, nByte);

  initialData(a_h, nElem);
  initialData(b_h, nElem);
  // printf("Thread %d begin to wait for signal\n", threadIdx);
  {
    std::unique_lock<std::mutex> lock(stage_mutex);
    while (!g_begin) {
      std::cv_status cvRet = stage_cond.wait_for(lock, std::chrono::seconds(5));
      if (cvRet == std::cv_status::timeout) {
        break;
      }
    }
  }

  // auto start = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(a_d, a_h, nByte, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(b_d, b_h, nByte, cudaMemcpyHostToDevice, stream);
  Timer timer(total_time, interval, launchKernelLowP, a_d, a_h, b_d, b_h, res_d, res_from_gpu_h, nElem, nByte, stream);
  timer.start();
  // auto end = std::chrono::high_resolution_clock::now();
  std::cout << "Time form thread " << threadIdx << " : " << "launched kernel count: " << timer.getKernelCount() << ", mean latency :" <<timer.getMeanKernelTime() << " ms" << std::endl;
  cudaMemcpyAsync(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost, stream);
  cudaCheck(cudaStreamSynchronize(stream));

  sumArrays(a_h, b_h, res_h, nElem);
  checkResult(res_h, res_from_gpu_h, nElem);

  // Destroy the CUDA events
  cudaEventDestroy(startEvent);
  cudaEventDestroy(stopEvent);
  cudaFree(a_d);
  cudaFree(b_d);
  cudaFree(res_d);
  free(a_h);
  free(b_h);
  free(res_h);
  free(res_from_gpu_h);
}

void higherPriorityWorker(cudaStream_t stream, int total_time, int interval) {
  int priority;
  cudaStreamGetPriority(stream, &priority);
  printf("Stream priority of higherPriorityWorker: %d\n", priority);
  cudaEvent_t startEvent, stopEvent;
  cudaCheck(cudaEventCreate(&startEvent));
  cudaCheck(cudaEventCreate(&stopEvent));
  int nElem = vector_size;
  // printf("Vector size:%d\n", nElem);

  int nByte = sizeof(float) * nElem;
  float* a_h = (float*)malloc(nByte);
  float* b_h = (float*)malloc(nByte);
  float* res_h = (float*)malloc(nByte);
  float* res_from_gpu_h = (float*)malloc(nByte);
  memset(res_h, 0, nByte);
  memset(res_from_gpu_h, 0, nByte);

  float *a_d, *b_d, *res_d;
  cudaMalloc((float**)&a_d, nByte);
  cudaMalloc((float**)&b_d, nByte);
  cudaMalloc((float**)&res_d, nByte);

  initialData(a_h, nElem);
  initialData(b_h, nElem);
  // printf("Thread %d begin to wait for signal\n", threadIdx);
  {
    std::unique_lock<std::mutex> lock(stage_mutex);
    while (!g_begin) {
      std::cv_status cvRet = stage_cond.wait_for(lock, std::chrono::seconds(5));
      if (cvRet == std::cv_status::timeout) {
        break;
      }
    }
  }

  // auto start = std::chrono::high_resolution_clock::now();
  cudaMemcpyAsync(a_d, a_h, nByte, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(b_d, b_h, nByte, cudaMemcpyHostToDevice, stream);
  Timer timer(total_time, interval, launchKernelHighP, a_d, a_h, b_d, b_h, res_d, res_from_gpu_h, nElem, nByte, stream);
  timer.start();
  cudaMemcpyAsync(res_from_gpu_h, res_d, nByte, cudaMemcpyDeviceToHost, stream);
  cudaCheck(cudaStreamSynchronize(stream));
  // auto end = std::chrono::high_resolution_clock::now();

  std::cout << "Time form higher priority thread" << " : " << "launched kernel count: " << timer.getKernelCount() << ", mean latency :" <<timer.getMeanKernelTime() << " ms" << std::endl;
  // std::cout << "Time form higher priority thread : " << " " << timer.getMeanKernelTime() << " ms" << std::endl;

  // Destroy the CUDA events
  cudaEventDestroy(startEvent);
  cudaEventDestroy(stopEvent);
  cudaFree(a_d);
  cudaFree(b_d);
  cudaFree(res_d);
  free(a_h);
  free(b_h);
  free(res_h);
  free(res_from_gpu_h);
}

int main(int argc, char** argv) {
  printf("Usage: ./priority_test [thread_num] [priority] [total_time] [interval] [highPriorityInterval].\n");

  // get lowest and highest stream priorities
  int g_lowestPriority;
  int g_highestPriority;
  cudaDeviceGetStreamPriorityRange(&g_lowestPriority, &g_highestPriority);
  printf("lowest priority: %d, highest priority: %d\n", g_lowestPriority, g_highestPriority);
  cudaSetDevice(0);
  int thread_num = 1;
  int priority = -1;
  int total_time = 5000;
  int interval = 100;
  int highPriorityInterval = 100;
  if (argc == 2) {
    thread_num = atoi(argv[1]);
  } else if (argc == 3) {
    thread_num = atoi(argv[1]);
    priority = atoi(argv[2]);
  } else if (argc >= 6) {
    thread_num = atoi(argv[1]);
    priority = atoi(argv[2]);
    total_time = atoi(argv[3]);
    interval = atoi(argv[4]);
    highPriorityInterval = atoi(argv[5]);
  }

  printf("Thread num : %d\n", thread_num);
  printf("priority : %d\n", priority);
  std::vector<cudaStream_t> streams;
  for (int i = 0; i < thread_num; i++) {
    cudaStream_t stream;
    cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, 0);
    // cudaStreamCreate(&stream);
    streams.push_back(stream);
  }
  // create worker threads
  std::vector<std::thread> threads;
  for (int i = 0; i < thread_num; i++) {
    threads.push_back(std::thread(worker, i, streams[i], total_time, interval));
  }
  // create worker with higher priority
  cudaStream_t higherPriorityStream;
  cudaStreamCreateWithPriority(&higherPriorityStream, cudaStreamNonBlocking, priority);
  // cudaStreamCreate(&higherPriorityStream);
  threads.push_back(std::thread(higherPriorityWorker, higherPriorityStream, total_time, highPriorityInterval));

  {
    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    std::unique_lock<std::mutex> lock(stage_mutex);
    g_begin = true;
    printf("Notify all threads to start\n");
    stage_cond.notify_all();
  }
  auto start = std::chrono::high_resolution_clock::now();

  // join thread
  for (int i = 0; i < thread_num + 1; i++) {
    threads[i].join();
  }
  auto end = std::chrono::high_resolution_clock::now();

  std::cout << "Total time: " << ((end - start).count() / 1e6) << " ms" << std::endl;
  // destroy streams
  cudaStreamDestroy(higherPriorityStream);
  for (int i = 0; i < thread_num; i++) {
    cudaStreamDestroy(streams[i]);
  }
  cudaDeviceReset();
  return 0;
}

To build and run this code, please use the command line below:

nvcc -o priority_test priority_test.cu    
./priority_test 2 -5 5000 500 10

The test launches two threads using low-priority CUDA streams and one thread using a higher-priority CUDA stream. A timer runs for 5000 ms. During this period, the low-priority threads launch a kernel every 500 ms, while the high-priority thread launches a kernel every 10 ms.
In this test, I expect the stream scheduler to take tasks from the higher-priority stream's queue first, and to take tasks from the lower-priority streams' queues only when the higher-priority queue is empty. However, based on the profiling results from nsys, the low-priority streams were not preempted by the high-priority stream, and their kernels still executed every 500 ms.
Please see the profile results in the image below.

Where did I go wrong in the process? Or is it possible that the conclusion itself - that the CUDA stream scheduler always prioritizes the high-priority stream - is incorrect?

There are no guarantees of preemption.

Thanks for your quick response.

If higher priority CUDA streams offer no assurance of preemption, what benefits do priority mechanisms bring to us?

Perhaps none.

Full stop. Let's let that one sink in.

The guaranteed behavior is that the CWD (block scheduler) will preferentially choose blocks from higher priority streams, when depositing new blocks on SMs.

The CWD/GPU may use preemption, but it is not guaranteed. Here is the description in the programming guide:

At runtime, pending work in higher-priority streams takes preference over pending work in low-priority streams.

  1. Note that the word "preemption" does not appear there.
  2. Note that the behavioral description impacts pending work, and the behavior description compares pending work to pending work, not pending work to scheduled/deposited/active work.

If you write a kernel which fully occupies the GPU, and enough threadblocks get scheduled (i.e. actually deposited on an SM, so they are no longer "pending") and those threadblocks run for a long time, and then along comes a higher priority kernel, there is no guarantee (that I know of) that the CWD/block scheduler will preempt a block, physically removing it from an SM, to "make way" for a block of the recently arrived higher priority kernel.
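To make that scenario concrete, here is a minimal sketch (the spin kernel, cycle counts, and grid sizing are my own illustrative assumptions, not code from this thread): a long-running kernel saturates the GPU from a low-priority stream, and a high-priority kernel arrives afterwards. Nothing in the documentation promises that the resident low-priority blocks will be preempted to make room for the high-priority blocks.

#include <cstdio>
#include <cuda_runtime.h>

// Busy-wait kernel: each block spins on its SM for roughly `cycles` clocks.
__global__ void spin(unsigned long long cycles) {
  unsigned long long start = clock64();
  while (clock64() - start < cycles) { }
}

int main() {
  int lowest, highest;
  cudaDeviceGetStreamPriorityRange(&lowest, &highest);

  cudaStream_t lo, hi;
  cudaStreamCreateWithPriority(&lo, cudaStreamNonBlocking, lowest);
  cudaStreamCreateWithPriority(&hi, cudaStreamNonBlocking, highest);

  // Enough blocks to cover every SM (queried, not hard-coded).
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  int blocks = prop.multiProcessorCount * 16;

  // 1) A long low-priority kernel occupies the whole GPU first...
  spin<<<blocks, 256, 0, lo>>>(200000000ULL);
  // 2) ...then a high-priority kernel arrives. Its blocks are "pending";
  //    the documentation does not promise that already-resident low-priority
  //    blocks will be preempted to make room for them.
  spin<<<blocks, 256, 0, hi>>>(10000000ULL);

  cudaDeviceSynchronize();
  return 0;
}

Profiling a run like this with nsys would show whether, and when, the high-priority blocks start; on the hardware I am familiar with they typically begin only as low-priority blocks retire.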

However, if you follow the design directions I gave here, then it may be that you generally get improved throughput of higher priority kernels.

Yes, I acknowledge the statement here:

Work in a higher priority stream may preempt work already executing in a low priority stream.

In English, may does not mean must or will. If you don't observe preemption in your setting, no guarantees were broken or violated.

I apologize for using the wrong term 'preemption' and for not making my point clear.
As you said, CWD will prioritize pending tasks from high-priority CUDA streams. In my testing, high-priority threads emit a task every 10ms, while low-priority threads emit a task every 500ms. Because high-priority tasks occur much more frequently, low-priority tasks will not be immediately selected for execution after being emitted.

For example, let's say there are two threads, A and B, with A having a higher priority than B.
Both threads emit their first kernel at the start, and after 500 ms, thread B emits its second kernel. However, by this time, thread A has already emitted 50 kernels (which cannot all be executed within 500 ms). As a result, thread B has to wait for its low-priority task to be selected for execution, since there are still high-priority tasks pending. Therefore, the interval on the timeline between the start of thread B's first kernel and the start of its second kernel should be much greater than 500 ms. However, the figure above shows that the interval is still 500 ms; furthermore, the intervals between all of thread B's kernel executions are also 500 ms, indicating that they are not affected by the high-priority CUDA stream.

This is my main confusion, and I hope I have explained it clearly.

The profiler suggests your kernels are running concurrently. Your kernel launch config for each appears to be <<<1,1>>>, indicating negligible resource usage. Why would you expect that not to happen if they are launched into separate streams and there are sufficient resources?

The execution of kernels in a particular created stream does not in and of itself prevent the simultaneous execution of kernels in another (created) stream. At all times your GPU has plenty of spare capacity, and so the kernels launched into the high priority stream, due to stream semantics, will not overlap each other; they will never at any moment consume more than negligible resources. Therefore that stream consumes negligible resources. The lower priority stream is using those available resources, which can never be consumed by the high priority stream.
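One way to sanity-check this is to compare the grid size against how many such blocks the device can keep resident at once. The sketch below is my own illustration (the tinyKernel stand-in mirrors the posted sumArraysLowP; in the real test you would query that kernel instead), and the number it prints is device-dependent:

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the posted sumArraysLowP: one thread loops over the array.
__global__ void tinyKernel(const float* a, const float* b, float* c, int n) {
  for (int i = 0; i < n; i++) c[i] = a[i] + b[i];
}

int main() {
  int numSMs = 0, blocksPerSM = 0;
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
  // How many blocks of this kernel can be resident per SM at block size 1?
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, tinyKernel,
                                                /*blockSize=*/1,
                                                /*dynamicSMemSize=*/0);
  printf("device can hold about %d such blocks at once; "
         "a <<<1,1>>> launch occupies just 1 of them\n",
         numSMs * blocksPerSM);
  return 0;
}

If the grid is much smaller than that capacity, kernels launched into other streams have room to run concurrently regardless of priority.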

I have no insight into the internal workings of the CUDA stream prioritization mechanism. I note that the wording of the specification, using may as the operative word, is consistent with how functionality of this kind is generally specified.

Based on experience with other prioritization mechanisms offered to programmers (notably OpenGL's glPrioritizeTextures() and C's register keyword), I think that it is entirely possible that the CUDA runtime pretty much ignores what is being specified and instead uses its own internal heuristics for maximizing device utilization while attempting to minimize latency. As a corollary, such heuristics, being implementation artifacts, are typically subject to change over time.

Note that the behavioral description impacts pending work, and the behavior description compares pending work to pending work, not pending work to scheduled/deposited/active work.

I understand your point about the impact on pending work now. Since the kernel uses minimal resources, it can be scheduled as soon as it is launched, even if it is from a low-priority stream.

By changing the launch configuration to <<<68 * 16, 1>>> for both kernels while keeping the kernel launch frequency the same, the kernels no longer execute concurrently. This launch configuration also causes a newly launched kernel to remain pending if another kernel is already running. I have re-profiled it, and below is a screenshot of the results.
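(For reference, the change described above presumably amounts to something like the sketch below, applied to launchKernelLowP from the original code. This is my reconstruction, not code from the post: the factor 68 is assumed to match the SM count of the GPU used, and 16 blocks per SM simply keeps the device busy enough that later launches stay pending.)

void launchKernelLowP(float *a_d, float *a_h, float *b_d, float *b_h,
                      float *res_d, float *res_from_gpu_h,
                      int nElem, int nByte, cudaStream_t stream) {
  // Every block redundantly computes the whole sum; the wide grid exists
  // only to occupy SMs so that subsequent kernels queue up as pending work.
  sumArraysLowP<<<68 * 16, 1, 0, stream>>>(a_d, b_d, res_d, nElem);
}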

  1. At time 0 ms, the high-priority stream begins launching the kernel every 10 ms.
  2. At time 500 ms, the high-priority stream has launched 50 kernels and executed 6 of them, with the 7th currently executing. Therefore, there are 43 kernels pending.
  3. At time 500 ms, the low-priority stream launches a kernel. However, since there are not enough resources to execute the kernel, it becomes pending.
  4. As shown in the figure above, the "Schedule time" indicates the time at which there are 43 pending kernels in the high-priority stream and 1 pending kernel in the low-priority stream.
  5. As described in the programming guide:

At runtime, pending work in higher-priority streams takes preference over pending work in low-priority streams.

The CWD is supposed to select the kernel from the high-priority stream rather than the low-priority stream at the 'Schedule time'. However, based on the profile results, it appears that the kernel from the low-priority stream was selected.

It seems like the priority mechanism is not working.

Looking forward to your response. Thanks.

I don't know that I have time to wade through your code. Here is my simpler test case:

$ cat t2.cu
#include <iostream>
#include <cstdlib>
__global__ void k(unsigned long long dt){

  unsigned long long start = clock64();
  while (clock64() < (start+dt));
}

int main(int argc, char *argv[]){

  cudaStream_t h, l;
  int hp, lp;
  cudaDeviceGetStreamPriorityRange(&lp, &hp);
  std::cout << "lowest priority: " << lp << " highest priority: " << hp << std::endl;
  cudaStreamCreateWithPriority(&h, cudaStreamDefault, hp);
  cudaStreamCreateWithPriority(&l, cudaStreamDefault, lp);
  unsigned long long dt = 10000000ULL;
  int blocks = 26*5;
  if (argc > 1) dt *= atoi(argv[1]);
  if (argc > 2) blocks = 1;
  for (int i = 0; i < 20; i++) k<<<blocks, 1024,0,h>>>(dt);
  k<<<blocks, 1024, 0, l>>>(dt);
  cudaDeviceSynchronize();
}
$ /usr/local/cuda/bin/nvcc t2.cu -o t2
$ /usr/local/cuda/bin/nsys nvprof --print-gpu-trace ./t2
WARNING: t2 and any of its children processes will be profiled.

lowest priority: 0 highest priority: -5
Generating '/tmp/nsys-report-5b2c.qdstrm'
[1/3] [========================100%] report7.nsys-rep
[2/3] [========================100%] report7.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report

  Start (ns)    Duration (ns)  CorrId  GrdX  GrdY  GrdZ  BlkX   BlkY  BlkZ  Reg/Trd  StcSMem (MB)  DymSMem (MB)  Bytes (MB)  Throughput (MBps)  SrcMemKd  DstMemKd            Device            Ctx  Strm          Name
 -------------  -------------  ------  ----  ----  ----  -----  ----  ----  -------  ------------  ------------  ----------  -----------------  --------  --------  --------------------------  ---  ----  ---------------------
   354,758,945     42,950,465     122   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   396,886,197    212,214,300     142   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    14  k(unsigned long long)
   405,322,904     44,144,518     123   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   455,988,524     45,221,198     124   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   506,639,361     46,315,766     125   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   557,294,486     44,034,248     126   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   604,563,862     43,371,543     127   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   647,936,302     38,853,211     128   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   686,790,506     38,869,307     129   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   725,660,805     37,520,666     130   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   763,182,304     36,686,732     131   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   799,869,836     36,694,284     132   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   836,564,920     36,706,413     133   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   873,272,229     36,706,476     134   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   909,979,505     36,707,500     135   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   946,687,805     36,712,012     136   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
   983,400,585     36,711,308     137   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
 1,020,112,757     36,713,068     138   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
 1,056,826,689     36,715,660     139   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
 1,093,543,213     36,713,580     140   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)
 1,130,257,593     36,714,348     141   130     1     1  1,024     1     1        8         0.000         0.000                                                     NVIDIA GeForce GTX 970 (0)    1    13  k(unsigned long long)

Generated:
    /home/bob/report7.nsys-rep
    /home/bob/report7.sqlite
$

(CUDA 12.1, GTX 970, Ubuntu 22.04)

Here is what I see happening. The kernels in the high priority stream (13) are serialized due to stream semantics. However most kernel launches will have a tail effect. That means that towards the end of the kernel execution duration, blocks start to finish and retire, but not all blocks have finished. This leaves "empty space" on the GPU, during which time the next high-priority kernel cannot begin due to stream semantics.

Therefore the block scheduler starts to schedule blocks from the lower priority kernel. However we observe that this kernel must not be running with "high" priority, because its duration is much longer than a high-priority kernel. So the block scheduler is scheduling blocks from the lower priority kernel and it "fills in the empty spaces" in the execution of the higher priority kernels. But the block scheduler is evidently preferring blocks from the high priority kernel/stream, when it is legal to schedule them. (In the example above, the lower priority kernel appears to have its execution "smeared out" over the duration of approximately 5 instances of the high priority kernel.)
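(A rough way to quantify that "smearing" is to wrap the low-priority launch in CUDA events. The fragment below is a sketch I am adding for illustration, not part of the original run; it is meant to replace the launch section of t2.cu's main() and reuses its h, l, blocks, and dt variables.)

  // Time the single low-priority kernel with events recorded in its stream.
  cudaEvent_t loStart, loStop;
  cudaEventCreate(&loStart);
  cudaEventCreate(&loStop);

  for (int i = 0; i < 20; i++) k<<<blocks, 1024, 0, h>>>(dt);  // high priority
  cudaEventRecord(loStart, l);
  k<<<blocks, 1024, 0, l>>>(dt);                               // low priority
  cudaEventRecord(loStop, l);
  cudaEventSynchronize(loStop);

  float ms = 0.f;
  cudaEventElapsedTime(&ms, loStart, loStop);
  // If the low-priority kernel only runs in the tails of the high-priority
  // kernels, this wall time comes out several times larger than one
  // high-priority kernel's duration (compare the ~212 ms vs ~40 ms above).
  printf("low-priority kernel wall time: %.1f ms\n", ms);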

I'm not sure I'll be able to explain every nuance of stream priority behavior. So I may not respond further. But I don't believe the example above is explainable in any other way, unless there is some stream priority effect. Good luck!


Your explanation is very convincing, and I appreciate your patience in explaining it to me. Thank you once again.
