Hello,
I stumbled upon a weird behavior. See the code:
#include <cstdio>
#include <thread>
#include <chrono>
#include <omp.h>
#include <cuda_runtime.h>
#define CHECK(status) do { _check((status), __FILE__, __LINE__); } while(false)
inline void _check(cudaError_t error_code, const char *file, int line)
{
if (error_code != cudaSuccess)
{
fprintf(stderr, "CUDA Error %d %s: %s. In file '%s' on line %d\n", error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), file, line);
fflush(stderr);
exit(1);
}
}
__global__ void long_kernel(double * xx)
{
double & x = *xx;
for(size_t i = 0; i < 100000000; i++)
{
x = x * x - 0.5;
}
}
int main(int argc, char ** argv)
{
cudaStream_t streamA;
cudaStream_t streamB;
CHECK(cudaStreamCreate(&streamA));
CHECK(cudaStreamCreate(&streamB));
double * d_dataA;
CHECK(cudaMalloc(&d_dataA, sizeof(double)));
double * d_dataB;
CHECK(cudaMalloc(&d_dataB, sizeof(double)));
double dataA;
double dataB;
for(int r = 0; r < 3; r++)
{
printf("Test #%d\n", r+1);
double total_start = omp_get_wtime();
#pragma omp parallel num_threads(2)
{
int i = omp_get_thread_num();
if(i == 0)
{
printf(" Thread A submitting kernel\n");
double kernel_start = omp_get_wtime();
long_kernel<<<1,1,0,streamA>>>(d_dataA);
double kernel_stop = omp_get_wtime();
printf(" Thread A kernel submitted, %.3f ms\n", (kernel_stop-kernel_start)*1000.0);
// printf(" Thread A synchronizing stream\n");
// double streamsync_start = omp_get_wtime();
// CHECK(cudaStreamSynchronize(streamA));
// double streamsync_stop = omp_get_wtime();
// printf(" Thread A stream synchronized, %.3f ms\n", (streamsync_stop-streamsync_start)*1000.0);
printf(" Thread A submitting memcpy\n");
double memcpy_start = omp_get_wtime();
CHECK(cudaMemcpyAsync(&dataA, d_dataA, sizeof(double), cudaMemcpyDeviceToHost, streamA));
double memcpy_stop = omp_get_wtime();
printf(" Thread A memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
}
if(i == 1)
{
printf(" Thread B sleeping\n");
double sleep_start = omp_get_wtime();
std::this_thread::sleep_for(std::chrono::milliseconds(100));
double sleep_stop = omp_get_wtime();
printf(" Thread B slept, %.3f ms\n", (sleep_stop-sleep_start)*1000.0);
printf(" Thread B submitting memcpy\n");
double memcpy_start = omp_get_wtime();
CHECK(cudaMemcpyAsync(&dataB, d_dataB, sizeof(double), cudaMemcpyDeviceToHost, streamB));
double memcpy_stop = omp_get_wtime();
printf(" Thread B memcpy submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
}
}
double sync_start = omp_get_wtime();
CHECK(cudaDeviceSynchronize());
double sync_stop = omp_get_wtime();
printf(" Synchronize: %.3f ms\n", (sync_stop-sync_start)*1000.0);
double total_stop = omp_get_wtime();
printf(" Total time: %.3f ms\n", (total_stop-total_start)*1000.0);
}
CHECK(cudaFree(d_dataA));
CHECK(cudaFree(d_dataB));
CHECK(cudaStreamDestroy(streamA));
CHECK(cudaStreamDestroy(streamB));
return 0;
}
I first create two cudaStream
s and allocate two doubles in device memory. Then I launch two threads. Each thread uses one of the streams. The first thread submits a ~470ms long kernel to the device, and then submits a cudaMemcpyAsync
which copies the result to host. The second thread does completely unrelated things - it first sleeps for 100ms, and then it copies a different variable from device to host using cudaMemcpyAsync
. This is done 3 times to get rid of warmup overheads.
The host memory is pagable, so the cudaMemcpyAsync will actually be synchronous and the cudaMemcpyAsync function will wait for that memory transfer to finish. I am OK with that.
Compile with nvcc -g -O2 -Xcompiler -fopenmp source.cu -o program.x
and run with ./program.x
.
Expected output:
...
Test #3
Thread B sleeping
Thread A submitting kernel
Thread A kernel submitted, 0.004 ms
Thread A submitting memcpy
Thread B slept, 100.052 ms
Thread B submitting memcpy
Thread B memcpy submitted, 0.013 ms
Thread A memcpy submitted, 467.089 ms
Synchronize: 0.003 ms
Total time: 467.131 ms
That is, I expect that the memcpy in thread B executes and copies the data immediately, since nothing is blocking the stream and no other memory is being transferred (thread A is still executing the kernel at that time, the memcpy happens only after that).
Actual output:
...
Test #3
Thread B sleeping
Thread A submitting kernel
Thread A kernel submitted, 0.004 ms
Thread A submitting memcpy
Thread B slept, 100.052 ms
Thread B submitting memcpy
Thread A memcpy submitted, 467.089 ms
Thread B memcpy submitted, 367.062 ms
Synchronize: 0.003 ms
Total time: 467.131 ms
That is, threadB’s memcpy is waiting for the threadA’s memcpy to finish for some reason.
Why is threadB’s memcpy waiting for threadA’s memcpy to finish? At the time when threadB wants to do the memcpy, streamA is executing the kernel on the device, no memcpy is actually being performed in streamA at that time, that could interfere with the threadB’s memcpy.
Why is this happening? Is this behavior expected?
It seems like the cudaMemcpyAsync operations with pageable host memory are ordered the same way as they were submitted. They are not executed independently as their time comes in their respective streams.
A simple workaround is to put a cudaStreamSynchronize(streamA)
in between the kernel and the memcpy in threadA’s code (the commented lines in the attached code). Then the threadB’s memcpy happens right away, and threadA’s runtime does not change. But why do I have to do that?
This was just a simple demonstration of the core of the problem. In my real application, threadA contains cusparseSpSM_analysis
(which basically launches some kernels and performs pageable memcpy, according to nsight), and threadB is trying to submit a host function (cudaLaunchHostFunc
), which I think uses the pageable memcpy on the inside (although I am not sure, but the behavior is the same if I replace the memcpy with cudalaunchostfunc). The ordering of the threadA/threadB execution does not match as exactly as in the original example, but the cudaLaunchHostFunc
takes almost the same time as the cusparseSpSM_analysis
- as if it was waiting for it to finish, despite being in two unrelated streams. I am unable to use the cudaStreamSynchronize workaround there, because I would need to modify the cusparseSpSM_analysis
function, which I can’t. Here is the code for that problem:
#include <cstdio>
#include <thread>
#include <chrono>
#include <vector>
#include <omp.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#define CHECK(status) do { _check((status), __FILE__, __LINE__); } while(false)
inline void _check(cudaError_t error_code, const char *file, int line)
{
if (error_code != cudaSuccess)
{
fprintf(stderr, "CUDA Error %d %s: %s. In file '%s' on line %d\n", error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), file, line);
fflush(stderr);
exit(1);
}
}
inline void _check(cusparseStatus_t status, const char *file, int line)
{
if (status != CUSPARSE_STATUS_SUCCESS) {
fprintf(stderr, "CUSPARSE Error %d %s: %s. In file '%s' on line %d\n", status, cusparseGetErrorName(status), cusparseGetErrorString(status), file, line);
fflush(stderr);
exit(1);
}
}
struct trsm_system
{
int size;
int nrhs;
int nnz;
int * A_rowptrs = nullptr;
int * A_colidxs = nullptr;
double * A_vals = nullptr;
double * B = nullptr;
double * X = nullptr;
};
trsm_system load_system(const char * file)
{
FILE * f = fopen(file, "r");
if(f == nullptr) throw std::runtime_error("cannot open matrix file");
trsm_system sys;
fscanf(f, "%d%d%d", &sys.size, &sys.nrhs, &sys.nnz);
std::vector<int> rowptrs(sys.size+1);
std::vector<int> colidxs(sys.nnz);
std::vector<double> vals(sys.nnz);
for(int r = 0; r <= sys.size; r++) fscanf(f, "%d", &rowptrs[r]);
for(int i = 0; i < sys.nnz; i++) fscanf(f, "%d", &colidxs[i]);
for(int i = 0; i < sys.nnz; i++) fscanf(f, "%lf", &vals[i]);
fclose(f);
std::vector<double> B(sys.size * sys.nrhs);
for(int i = 0; i < B.size(); i++) B[i] = (double)rand() / RAND_MAX;
CHECK(cudaMalloc(&sys.A_rowptrs, (sys.size + 1) * sizeof(int)));
CHECK(cudaMalloc(&sys.A_colidxs, sys.nnz * sizeof(int)));
CHECK(cudaMalloc(&sys.A_vals, sys.nnz * sizeof(double)));
CHECK(cudaMalloc(&sys.B, sys.size * sys.nrhs * sizeof(double)));
CHECK(cudaMalloc(&sys.X, sys.size * sys.nrhs * sizeof(double)));
CHECK(cudaMemcpy(sys.A_rowptrs, rowptrs.data(), (sys.size + 1) * sizeof(int), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(sys.A_colidxs, colidxs.data(), sys.nnz * sizeof(int), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(sys.A_vals, vals.data(), sys.nnz * sizeof(double), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(sys.B, B.data(), sys.size * sys.nrhs * sizeof(double), cudaMemcpyHostToDevice));
return sys;
}
void clear_system(trsm_system & sys)
{
CHECK(cudaFree(sys.A_rowptrs));
CHECK(cudaFree(sys.A_colidxs));
CHECK(cudaFree(sys.A_vals));
CHECK(cudaFree(sys.B));
CHECK(cudaFree(sys.X));
}
int main(int argc, char ** argv)
{
trsm_system sys = load_system("matrix25.txt");
cudaStream_t streamA;
cudaStream_t streamB;
CHECK(cudaStreamCreate(&streamA));
CHECK(cudaStreamCreate(&streamB));
for(int r = 0; r < 3; r++)
{
printf("Test #%d\n", r+1);
cusparseHandle_t handle;
CHECK(cusparseCreate(&handle));
CHECK(cusparseSetStream(handle, streamA));
cusparseSpMatDescr_t descr_A;
CHECK(cusparseCreateCsr(&descr_A, sys.size, sys.size, sys.nnz, sys.A_rowptrs, sys.A_colidxs, sys.A_vals, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F));
auto lower = CUSPARSE_FILL_MODE_LOWER;
auto nonunit = CUSPARSE_DIAG_TYPE_NON_UNIT;
CHECK(cusparseSpMatSetAttribute(descr_A, CUSPARSE_SPMAT_FILL_MODE, &lower, sizeof(lower)));
CHECK(cusparseSpMatSetAttribute(descr_A, CUSPARSE_SPMAT_DIAG_TYPE, &nonunit, sizeof(nonunit)));
cusparseDnMatDescr_t descr_B, descr_X;
CHECK(cusparseCreateDnMat(&descr_B, sys.size, sys.nrhs, sys.nrhs, sys.B, CUDA_R_64F, CUSPARSE_ORDER_ROW));
CHECK(cusparseCreateDnMat(&descr_X, sys.size, sys.nrhs, sys.nrhs, sys.X, CUDA_R_64F, CUSPARSE_ORDER_ROW));
cusparseSpSMDescr_t descr_spsm;
CHECK(cusparseSpSM_createDescr(&descr_spsm));
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
double one = 1.0;
size_t buffersize;
void * buffer;
CHECK(cusparseSpSM_bufferSize(handle, opA, opB, &one, descr_A, descr_B, descr_X, CUDA_R_64F, CUSPARSE_SPSM_ALG_DEFAULT, descr_spsm, &buffersize));
CHECK(cudaDeviceSynchronize());
CHECK(cudaMalloc(&buffer, buffersize));
double total_start = omp_get_wtime();
#pragma omp parallel num_threads(2)
{
int i = omp_get_thread_num();
if(i == 0)
{
printf(" Thread A analysis starting\n");
double kernel_start = omp_get_wtime();
CHECK(cusparseSpSM_analysis(handle, opA, opB, &one, descr_A, descr_B, descr_X, CUDA_R_64F, CUSPARSE_SPSM_ALG_DEFAULT, descr_spsm, buffer));
double kernel_stop = omp_get_wtime();
printf(" Thread A analysis finished, %.3f ms\n", (kernel_stop-kernel_start)*1000.0);
}
if(i == 1)
{
printf(" Thread B sleeping\n");
double sleep_start = omp_get_wtime();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
double sleep_stop = omp_get_wtime();
printf(" Thread B slept, %.3f ms\n", (sleep_stop-sleep_start)*1000.0);
printf(" Thread B submitting host func\n");
double memcpy_start = omp_get_wtime();
CHECK(cudaLaunchHostFunc(streamB, [](void*){
printf(" Thread B host func exec\n");
}, nullptr));
double memcpy_stop = omp_get_wtime();
printf(" Thread B host func submitted, %.3f ms\n", (memcpy_stop-memcpy_start)*1000.0);
}
}
double sync_start = omp_get_wtime();
CHECK(cudaDeviceSynchronize());
double sync_stop = omp_get_wtime();
printf(" Synchronize: %.3f ms\n", (sync_stop-sync_start)*1000.0);
double total_stop = omp_get_wtime();
printf(" Total time: %.3f ms\n", (total_stop-total_start)*1000.0);
CHECK(cudaFree(buffer));
CHECK(cusparseSpSM_destroyDescr(descr_spsm));
CHECK(cusparseDestroyDnMat(descr_B));
CHECK(cusparseDestroyDnMat(descr_X));
CHECK(cusparseDestroySpMat(descr_A));
CHECK(cusparseDestroy(handle));
}
CHECK(cudaStreamDestroy(streamA));
CHECK(cudaStreamDestroy(streamB));
clear_system(sys);
return 0;
}
Compile with nvcc -Wno-deprecated-declarations -g -O2 -Xcompiler -fopenmp source_realworld.cu -o program_real.x -lcusparse
. Also here is the matrix file that is used in the program: matrix25.txt (98.0 MB)
I use CUDA 12.4.0 with its libraries, gpu driver 550.54.15, A100-SXM4-40GB gpu.
Will provide additional details or explanation if needed.
Thanks in advance for help,
Jakub