The matrices within a set are all the same size, typically 256x256 or 512x512, and as large as 2048x2048. We have perhaps 100 matrices to compute in a set, and the SVD is one part of a larger processing chain.
I have tried using multiple host threads, each with its own cuSOLVER handle and stream, and have not seen any overlap. I have also tried small matrices (64x64) to rule out a resource limitation. I'm attaching an example of how I set up the test; I have tried variants with similar results.
#include <algorithm>
#include <chrono>
#include <cstring>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

#include <cuComplex.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>

void TestParallelSVD()
{
constexpr int XRES = 256;
auto pfcInputData = std::make_unique<cuComplex[]>(XRES * XRES);
// Create dummy input
for (int i = 0; i < XRES * XRES; ++i)
{
pfcInputData[i].x = static_cast<float>(i % 316);
pfcInputData[i].y = static_cast<float>(i % 247);
}
struct SVD_INSTANCE
{
cuComplex* h_pfcSrc;
cuComplex* h_pfcU;
cuComplex* d_svdWork;
cuComplex* d_fcV;
cuComplex* d_fcZ;
float* d_svdRWork;
float* d_fSvdS;
cuComplex* d_fcSvdU;
int* d_svdInfo;
cusolverDnHandle_t d_cusolverHandle;
cudaStream_t stream;
int lwork;
int id;
};
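// Each instance owns its own cuSOLVER handle, non-blocking stream, pinned host buffers,
// and device workspace so that work can be issued independently from each host thread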
constexpr int iNumThreads = 16;
// Initialize instance data for each thread
std::vector<SVD_INSTANCE> vecSVD(iNumThreads);
for (int i = 0; i < iNumThreads; ++i)
{
auto& SVD = vecSVD[i];
SVD.id = i;
auto result = cudaStreamCreateWithFlags(&SVD.stream, cudaStreamNonBlocking);
auto status = cusolverDnCreate(&SVD.d_cusolverHandle);
status = cusolverDnSetStream(SVD.d_cusolverHandle, SVD.stream);
auto cr = cudaMallocHost((void**)&SVD.h_pfcSrc, sizeof(cuComplex) * XRES * XRES);
cr = cudaMallocHost((void**)&SVD.h_pfcU, sizeof(cuComplex) * XRES * XRES);
// Fill this instance's pinned source buffer with the dummy input
memcpy(SVD.h_pfcSrc, pfcInputData.get(), sizeof(cuComplex) * XRES * XRES);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_fcZ), sizeof(cuComplex) * XRES * XRES);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_fSvdS), sizeof(float) * XRES);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_fcSvdU), sizeof(cuComplex) * XRES * XRES);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_fcV), sizeof(cuComplex) * XRES * XRES);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_svdInfo), sizeof(int));
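// Query the SVD workspace size and allocate the work and rwork buffers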
const int m = XRES;
const int n = XRES;
status = cusolverDnCgesvd_bufferSize(SVD.d_cusolverHandle, m, n, &SVD.lwork);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_svdWork), sizeof(cuComplex) * SVD.lwork);
cr = cudaMalloc(reinterpret_cast<void**>(&SVD.d_svdRWork), sizeof(float) * std::min(m, n));
}
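// Per-thread task: async host-to-device copy, complex single-precision SVD, and async
// device-to-host copy of U, all issued on this instance's stream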
auto svdTask = [&](int iThread)
{
auto& SVD = vecSVD[iThread];
auto cr = cudaMemcpyAsync(SVD.d_fcZ, SVD.h_pfcSrc, XRES * XRES * sizeof(cuComplex), cudaMemcpyHostToDevice, SVD.stream);
// matrix is square, so all leading dimensions are XRES
const int m = XRES;
const int n = XRES;
const int lda = m;
const int ldu = m;
const int ldv = n;
constexpr signed char jobu = 'A'; // all m columns of U
constexpr signed char jobvt = 'N'; // no rows of VT
auto status = cusolverDnCgesvd(
SVD.d_cusolverHandle, jobu, jobvt,
m,
n,
SVD.d_fcZ,
lda,
SVD.d_fSvdS,
SVD.d_fcSvdU,
ldu,
SVD.d_fcV,
ldv,
SVD.d_svdWork, SVD.lwork, SVD.d_svdRWork, SVD.d_svdInfo);
cr = cudaMemcpyAsync(SVD.h_pfcU, SVD.d_fcSvdU, XRES * XRES * sizeof(cuComplex), cudaMemcpyDeviceToHost, SVD.stream);
};
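// Repeat the timed run; the first pass may also include one-time cuSOLVER initialization cost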
constexpr int iNumTests = 3;
for (int iTest = 0; iTest < iNumTests; ++iTest)
{
auto start = std::chrono::high_resolution_clock::now();
// Run as threads
{
std::vector<std::jthread> vecThreads;
for (int i = 0; i < iNumThreads; ++i)
{
vecThreads.emplace_back(svdTask, i);
}
} // jthreads join here, so every thread has finished enqueuing its work
// Synchronize: wait for the GPU work on each stream to complete
for (int i = 0; i < iNumThreads; ++i)
{
cudaStreamSynchronize(vecSVD[i].stream);
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Time taken threaded: " << duration.count() << "ms" << std::endl;
}
// Free
for (int i = 0; i < iNumThreads; ++i)
{
auto& SVD = vecSVD[i];
cudaFreeHost(SVD.h_pfcSrc);
cudaFreeHost(SVD.h_pfcU);
cudaFree(SVD.d_fcZ);
cudaFree(SVD.d_fSvdS);
cudaFree(SVD.d_fcSvdU);
cudaFree(SVD.d_fcV);
cudaFree(SVD.d_svdInfo);
cusolverDnDestroy(SVD.d_cusolverHandle);
cudaStreamDestroy(SVD.stream);
}
cudaDeviceReset();
}