Hello
We have a toy example where we try to reproduce some runtime behaviors we observed in our application.
We understand the following:
1 When we add a callback in a stream, anything after it will wait until the callback finishes.
2 If after the callback we add an event, and we force another stream to wait on that event, anything after the wait event on that second stream will wait until the callback is finished.
But we found some situations that we don't know how to explain — we don't know how to interpret what we can expect from the CUDA runtime, or what the reasons for that behavior are.
Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
#define BUFFER_SIZE 20000000
// Dummy busy-work kernel used to occupy streamA in the repro.
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest and the kernel would finish near-instantly — invalidating
// the timing/ordering observations. 'volatile' forces the stores to be
// emitted so the kernel actually spends time on the device.
__global__ void kernelDummy1() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Dummy busy-work kernel launched on streamB after the event wait.
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest; 'volatile' keeps the busy loop alive so the kernel does
// measurable work for the scheduling experiment.
__global__ void kernelDummy2() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Dummy busy-work kernel launched repeatedly on streamC (independent stream).
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest; 'volatile' keeps the busy loop alive so the kernel does
// measurable work for the scheduling experiment.
__global__ void kernelDummy3() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Host callback enqueued with cudaStreamAddCallback. It runs on a CUDA
// runtime thread after all preceding work in the stream completes; work
// enqueued after it in the stream waits until it returns.
// NOTE(review): _sleep() is MSVC/Windows-specific (non-portable) and with
// argument 0 is effectively a no-op placeholder for real host-side work.
void CUDART_CB MyCallback (cudaStream_t stream, cudaError_t status, void *userData){
_sleep(0);
}
// Repro driver: interleaves H2D copies and a host callback on streamA,
// independent kernels on streamC, and an event-dependent kernel on streamB.
int main() {
    cudaStream_t streamA, streamB, streamC, streamD;
    cudaError_t err;

    // Pinned host memory: required for cudaMemcpyAsync to be truly async.
    int* buffer = 0;
    err = cudaMallocHost(&buffer, sizeof(int) * BUFFER_SIZE, 0);
    assert(err == cudaSuccess);
    for (int i = 0; i < BUFFER_SIZE; i++)
        buffer[i] = i;

    int* d_bufferA;
    int* d_bufferB;
    err = cudaMalloc(&d_bufferA, BUFFER_SIZE * sizeof(int));
    assert(err == cudaSuccess);
    err = cudaMalloc(&d_bufferB, BUFFER_SIZE * sizeof(int));
    assert(err == cudaSuccess);

    cudaEvent_t eventA;
    err = cudaEventCreate(&eventA, 0);
    assert(err == cudaSuccess);

    err = cudaStreamCreate(&streamA);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamB);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamC);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamD);
    assert(err == cudaSuccess);

    //cudaDeviceSynchronize();
    for (int i = 0; i < 10; i++) {
        // comment this, we have different behaviour, and worse!!!
        kernelDummy1<<<32, 1, 0, streamA>>>();
        // Fix: kernel launches do not return cudaError_t directly; launch
        // configuration errors only surface via cudaGetLastError().
        err = cudaGetLastError();
        assert(err == cudaSuccess);

        // Ten back-to-back H2D copies queued on streamA.
        // Fix: the original ignored cudaMemcpyAsync's return value.
        for (int j = 0; j < 10; j++) {
            err = cudaMemcpyAsync(d_bufferA, buffer, BUFFER_SIZE * sizeof(int),
                                  cudaMemcpyHostToDevice, streamA);
            assert(err == cudaSuccess);
        }

        // Six independent kernels on streamC (expected to overlap with
        // the streamA work above).
        for (int k = 0; k < 6; k++)
            kernelDummy3<<<32, 1, 0, streamC>>>();
        err = cudaGetLastError();
        assert(err == cudaSuccess);

        // Host callback on streamA: everything queued on streamA after this
        // point — and anything waiting on eventA below — is blocked until
        // the callback returns. Fix: return value was ignored.
        err = cudaStreamAddCallback(streamA, MyCallback, 0, 0);
        assert(err == cudaSuccess);

        err = cudaEventRecord(eventA, streamA);
        assert(err == cudaSuccess);
        err = cudaStreamWaitEvent(streamB, eventA, 0);
        assert(err == cudaSuccess);

        // This kernel must run after the callback (streamB waits on eventA).
        kernelDummy2<<<32, 1, 0, streamB>>>();
        err = cudaGetLastError();
        assert(err == cudaSuccess);

        // Fix: check the sync as well — in-kernel faults surface here.
        err = cudaDeviceSynchronize();
        assert(err == cudaSuccess);
    }

    err = cudaStreamDestroy(streamA);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamB);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamC);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamD);
    assert(err == cudaSuccess);

    // Fix: the original leaked eventA (no cudaEventDestroy).
    err = cudaEventDestroy(eventA);
    assert(err == cudaSuccess);

    err = cudaFree(d_bufferA);
    assert(err == cudaSuccess);
    err = cudaFree(d_bufferB);
    assert(err == cudaSuccess);
    err = cudaFreeHost(buffer);
    assert(err == cudaSuccess);
    return 0;
}
In this example, the behavior is almost as expected.
- First kernel scheduled on streamA executes before the callback.
- The execution of kernels in streamC is independent of all other streams, so they execute concurrently with the rest of the code.
- The kernel scheduled on stream B will be executed after the callback.
But only commenting out the first kernel launch on streamA (line 75) makes all the kernels on ALL streams execute after all the memory transfers have finished. WHY?
Nsight:
Code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cassert>
#define BUFFER_SIZE 20000000
// Dummy busy-work kernel used to occupy streamA in the repro.
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest and the kernel would finish near-instantly — invalidating
// the timing/ordering observations. 'volatile' forces the stores to be
// emitted so the kernel actually spends time on the device.
__global__ void kernelDummy1() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Dummy busy-work kernel launched on streamB after the event wait.
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest; 'volatile' keeps the busy loop alive so the kernel does
// measurable work for the scheduling experiment.
__global__ void kernelDummy2() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Dummy busy-work kernel launched repeatedly on streamC (independent stream).
// Fix: 'b' was a dead store, so the compiler could eliminate the whole
// loop nest; 'volatile' keeps the busy loop alive so the kernel does
// measurable work for the scheduling experiment.
__global__ void kernelDummy3() {
    volatile int b = 0;
    for (int j = 0; j < 100; j++)
        for (int i = 0; i < 100; i++)
            b = i * 2;
}
// Host callback enqueued with cudaStreamAddCallback. It runs on a CUDA
// runtime thread after all preceding work in the stream completes; work
// enqueued after it in the stream waits until it returns.
// NOTE(review): _sleep() is MSVC/Windows-specific (non-portable) and with
// argument 0 is effectively a no-op placeholder for real host-side work.
void CUDART_CB MyCallback (cudaStream_t stream, cudaError_t status, void *userData){
_sleep(0);
}
// Variant of the repro driver with the initial streamA kernel launch
// commented out (this is the case whose scheduling surprised the authors).
int main() {
    cudaStream_t streamA, streamB, streamC, streamD;
    cudaError_t err;

    // Pinned host memory: required for cudaMemcpyAsync to be truly async.
    int* buffer = 0;
    err = cudaMallocHost(&buffer, sizeof(int) * BUFFER_SIZE, 0);
    assert(err == cudaSuccess);
    for (int i = 0; i < BUFFER_SIZE; i++)
        buffer[i] = i;

    int* d_bufferA;
    int* d_bufferB;
    err = cudaMalloc(&d_bufferA, BUFFER_SIZE * sizeof(int));
    assert(err == cudaSuccess);
    err = cudaMalloc(&d_bufferB, BUFFER_SIZE * sizeof(int));
    assert(err == cudaSuccess);

    cudaEvent_t eventA;
    err = cudaEventCreate(&eventA, 0);
    assert(err == cudaSuccess);

    err = cudaStreamCreate(&streamA);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamB);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamC);
    assert(err == cudaSuccess);
    err = cudaStreamCreate(&streamD);
    assert(err == cudaSuccess);

    //cudaDeviceSynchronize();
    for (int i = 0; i < 10; i++) {
        // comment this, we have different behaviour, and worse!!!
        //kernelDummy1<<<32, 1, 0, streamA>>>();

        // Ten back-to-back H2D copies queued on streamA.
        // Fix: the original ignored cudaMemcpyAsync's return value.
        for (int j = 0; j < 10; j++) {
            err = cudaMemcpyAsync(d_bufferA, buffer, BUFFER_SIZE * sizeof(int),
                                  cudaMemcpyHostToDevice, streamA);
            assert(err == cudaSuccess);
        }

        // Six independent kernels on streamC (expected to overlap with
        // the streamA work above).
        for (int k = 0; k < 6; k++)
            kernelDummy3<<<32, 1, 0, streamC>>>();
        // Fix: kernel launches do not return cudaError_t directly; launch
        // configuration errors only surface via cudaGetLastError().
        err = cudaGetLastError();
        assert(err == cudaSuccess);

        // Host callback on streamA: everything queued on streamA after this
        // point — and anything waiting on eventA below — is blocked until
        // the callback returns. Fix: return value was ignored.
        err = cudaStreamAddCallback(streamA, MyCallback, 0, 0);
        assert(err == cudaSuccess);

        err = cudaEventRecord(eventA, streamA);
        assert(err == cudaSuccess);
        err = cudaStreamWaitEvent(streamB, eventA, 0);
        assert(err == cudaSuccess);

        // This kernel must run after the callback (streamB waits on eventA).
        kernelDummy2<<<32, 1, 0, streamB>>>();
        err = cudaGetLastError();
        assert(err == cudaSuccess);

        // Fix: check the sync as well — in-kernel faults surface here.
        err = cudaDeviceSynchronize();
        assert(err == cudaSuccess);
    }

    err = cudaStreamDestroy(streamA);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamB);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamC);
    assert(err == cudaSuccess);
    err = cudaStreamDestroy(streamD);
    assert(err == cudaSuccess);

    // Fix: the original leaked eventA (no cudaEventDestroy).
    err = cudaEventDestroy(eventA);
    assert(err == cudaSuccess);

    err = cudaFree(d_bufferA);
    assert(err == cudaSuccess);
    err = cudaFree(d_bufferB);
    assert(err == cudaSuccess);
    err = cudaFreeHost(buffer);
    assert(err == cudaSuccess);
    return 0;
}