My program hangs on GeForce RTX 3080 and RTX 3070 GPUs, but it runs correctly on GeForce RTX 2080 and RTX 2070 GPUs. Here is the test code. Can anyone help me?
// Reads exactly iLength bytes from the start of the binary file sFileName into the buffer at pData.
//
// pData     - destination buffer of at least iLength bytes. The parameter is declared const for
//             backward compatibility with existing callers, but the buffer IS written to.
// iLength   - number of bytes to read; must be non-negative.
// sFileName - path of the file to read.
//
// Returns true only if the file could be opened and all iLength bytes were read. Returns false
// for a null buffer, a negative length, an unopenable file, or a file shorter than iLength
// (in which case the buffer contents are only partially filled and must not be used).
static bool ReadBin(const void* pData, int64_t iLength, std::string sFileName)
{
    if (!pData || iLength < 0)
        return false;
    std::ifstream input(sFileName.c_str(), std::ios::in | std::ios::binary);
    if (!input.is_open())
        return false;
    const std::streamsize nWanted = static_cast<std::streamsize>(iLength);
    // The signature takes a const pointer, so the constness has to be cast away to fill the buffer.
    input.read(static_cast<char*>(const_cast<void*>(pData)), nWanted);
    // A short read sets failbit; gcount() tells how many bytes actually arrived.
    return input.gcount() == nWanted;
}
// Bundle of the GPU resources used together for one SVD call: an index used only for logging,
// a cuSOLVER dense handle, and the CUDA stream that handle is attached to.
// The struct is a plain value type; it does not create or destroy the handles it carries.
struct stGpuHandles
{
    // Default-constructs an "unassigned" bundle: index -1 and null handles, so that a
    // default-constructed instance never carries indeterminate values.
    stGpuHandles() : iIndex(-1), hCusolver(), hStream()
    {
    }
    // Wraps already-created resources. Initializers are listed in member declaration order.
    stGpuHandles(int index, cudaStream_t stream, cusolverDnHandle_t solver) : iIndex(index), hCusolver(solver), hStream(stream)
    {
    }
    int iIndex;                    // identifier printed in the per-task log line
    cusolverDnHandle_t hCusolver;  // cuSOLVER dense handle
    cudaStream_t hStream;          // stream the solver work and copies are issued on
};
// Gives the project's CFloat element type the MKL-style name MKL_Complex8.
// NOTE(review): CFloat is declared elsewhere; the test copies buffers of this type to and from
// cuComplex device buffers, so it presumably has the same size and layout as cuComplex - confirm.
typedef CFloat MKL_Complex8;
int testCudaMthSvd()
{
cout << "testcudasvd in\n";
const int im = 19832;
const int in = 676;
int ret = 0;
shared_ptr<CFloat> spImgRaw(new CFloat[im * in]);
if (!ReadBin(spImgRaw.get(), sizeof(CFloat) * im * in, "msepiDump_pIm2RowA.bin"))
{
cerr << " Read img data file msepiDump_pIm2RowA failed\n";
return -1;
}
int iTasks = std::getenv("SVD_TASK_NUM") ? atoi(std::getenv("SVD_TASK_NUM")) : 40000;
int iThreadNum = std::getenv("SVD_THREAD_NUM") ? atoi(std::getenv("SVD_THREAD_NUM")) : 20;
int iHandles = std::getenv("SVD_HANDLE_NUM") ? atoi(std::getenv("SVD_HANDLE_NUM")) : 20;
bool bDevide = std::getenv("SVD_DEVIDE") != 0;
bool bBindThread = std::getenv("SVD_NOT_BIND") == 0;
cout << "Param:" << "\n"
<< setw(20) << "task:" << setw(10) << iTasks << "\n"
<< setw(20) << "thread:" << setw(10) << iThreadNum << "\n"
<< setw(20) << "handle:" << setw(10) << iHandles << "\n"
<< setw(20) << "devide:" << setw(10) << bDevide << "\n"
<< setw(20) << "bind:" << setw(10) << bBindThread << "\n";
int iDevCount = 0;
if ((ret = cudaGetDeviceCount(&iDevCount) != cudaSuccess))
{
cerr << "no gpu available\n";
}
map<int, stGpuHandles> mpThreadHandles;
deque<stGpuHandles> handles;
if (!bBindThread)
{
for (int i = 0; i < iHandles; ++i)
{
cusolverDnHandle_t handleCusolver;
cudaStream_t hStream = 0;
CHECKED_CALL_CUDA(cudaStreamCreate(&hStream));
CHECKED_CALL_CUSOLVER(cusolverDnCreate(&handleCusolver));
handles.push_back(stGpuHandles(i, hStream, handleCusolver));
}
}
auto t0 = boost::posix_time::microsec_clock::local_time();
boost::mutex mtxOut;
boost::condition_variable cv;
vector<boost::thread> vThreads(iThreadNum);
for (int i = 0; i < iThreadNum; ++i)
{
vThreads[i] = boost::thread([=, &iTasks, &mtxOut, &cv, &handles, &mpThreadHandles](int ith)-> int
{
int iCurTask;
while ((iCurTask = --iTasks) >= 0)
{
stGpuHandles handle;
{
boost::mutex::scoped_lock lc(mtxOut);
if (bBindThread)
{
if (mpThreadHandles.find(ith) == mpThreadHandles.end())
{
cusolverDnHandle_t handleCusolver;
cudaStream_t hStream = 0;
CHECKED_CALL_CUDA(cudaStreamCreate(&hStream));
CHECKED_CALL_CUSOLVER(cusolverDnCreate(&handleCusolver));
mpThreadHandles[ith] = stGpuHandles(ith, hStream, handleCusolver);
}
handle = mpThreadHandles[ith];
}
else
{
while (handles.size() <= 0)
{
cv.wait(lc);
}
handle = *handles.begin();
handles.pop_front();
}
}
cudaStream_t hStream = handle.hStream;
cusolverDnHandle_t hCusolver = handle.hCusolver;
CHECKED_CALL_CUSOLVER(cusolverDnSetStream(hCusolver, hStream));
shared_ptr<float> spS(new float[in]);
shared_ptr<MKL_Complex8> spVt(new MKL_Complex8[in * in]);
shared_ptr<MKL_Complex8> spU(new MKL_Complex8[im * in]);
cuComplex* pdA = NULL;
cuComplex* pdU = NULL;
cuComplex* pdVt = NULL;
cuComplex* pdWork = NULL;
int iWork = 0;
float* pdS = NULL;
float* pdRwork = NULL;
int* pdDevInfo = NULL;
int devInfo = 0;
gesvdjInfo_t gesvdj_params;
if (bDevide)
{
CHECKED_CALL_CUSOLVER(cusolverDnCgesvd_bufferSize(hCusolver, im, in, &iWork));
}
else
{
CHECKED_CALL_CUSOLVER(cusolverDnCreateGesvdjInfo(&gesvdj_params));
CHECKED_CALL_CUSOLVER(cusolverDnCgesvdj_bufferSize(hCusolver, CUSOLVER_EIG_MODE_VECTOR, 1, im, in, pdA, im, pdS, pdU, im, pdVt, in, &iWork,
gesvdj_params));
}
CHECKED_CALL_CUDA(cudaMalloc(&pdA, sizeof(cuComplex)* im * in));
CHECKED_CALL_CUDA(cudaMalloc(&pdU, sizeof(cuComplex)* im * in));
CHECKED_CALL_CUDA(cudaMalloc(&pdVt, sizeof(cuComplex)* in * in));
CHECKED_CALL_CUDA(cudaMalloc(&pdWork, sizeof(cuComplex)* iWork));
CHECKED_CALL_CUDA(cudaMalloc(&pdS, sizeof(float)* in));
CHECKED_CALL_CUDA(cudaMalloc(&pdRwork, sizeof(float)* std::min(im, in)));
CHECKED_CALL_CUDA(cudaMalloc(&pdDevInfo, sizeof(int)));
shared_ptr<void> pFake(0, [&](void*)
{
cudaFree(pdA);
cudaFree(pdU);
cudaFree(pdVt);
cudaFree(pdWork);
cudaFree(pdS);
cudaFree(pdRwork);
cudaFree(pdDevInfo);
cusolverDnDestroyGesvdjInfo(gesvdj_params);
});
shared_ptr<CFloat> spImg(new CFloat[im * in]);
memcpy(spImg.get(), spImgRaw.get(), sizeof(im * in * sizeof(CFloat)));
auto t0 = boost::posix_time::microsec_clock::local_time();
cudaMemcpyAsync(pdA, spImg.get(), sizeof(CFloat) * im * in, cudaMemcpyHostToDevice, hStream);
if (bDevide)
{
CHECKED_CALL_CUSOLVER(cusolverDnCgesvd(hCusolver, 'S', 'S', im, in, pdA, im, pdS, pdU, im, pdVt, in, pdWork, iWork, pdRwork, pdDevInfo));
}
else
{
CHECKED_CALL_CUSOLVER(cusolverDnCgesvdj(hCusolver, CUSOLVER_EIG_MODE_VECTOR, 1, im, in, pdA, im, pdS, pdU, im, pdVt, in, pdWork, iWork,
pdDevInfo, gesvdj_params));
}
CHECKED_CALL_CUDA(cudaMemcpyAsync(&devInfo, pdDevInfo, sizeof(devInfo), cudaMemcpyDeviceToHost, hStream));
CHECKED_CALL_CUDA(cudaMemcpyAsync(spU.get(), pdU, sizeof(cuComplex)*im* in, cudaMemcpyDeviceToHost, hStream));
CHECKED_CALL_CUDA(cudaMemcpyAsync(spS.get(), pdS, sizeof(float)* in, cudaMemcpyDeviceToHost, hStream));
CHECKED_CALL_CUDA(cudaMemcpyAsync(spVt.get(), pdVt, sizeof(cuComplex)* in * in, cudaMemcpyDeviceToHost, hStream));
cudaStreamSynchronize(hStream);
if (devInfo < 0)
{
cerr << "param:" << devInfo << "error\n";
}
{
boost::mutex::scoped_lock lc(mtxOut);
uint64_t iGpuMemAlloc = sizeof(cuComplex) * im * in
+ sizeof(cuComplex) * im * in
+ sizeof(cuComplex) * in * in
+ sizeof(cuComplex) * iWork
+ sizeof(int)
+ sizeof(float) * in
+ sizeof(float) * std::min(im, in);
if (!bBindThread)
{
handles.push_back(handle);
}
cv.notify_all();
cout << "Thread:" << setw(2) << ith
<< " use handle:" << setw(2) << handle.iIndex
<< " ,run task:" << setw(2) << iCurTask
<< " ,got value:" << setw(5) << *(spS.get())
<< " ,time:" << (boost::posix_time::microsec_clock::local_time() - t0).total_milliseconds()
<< " ,GpuMem:" << int(iGpuMemAlloc / (1024.f * 1024)) << "M,\n";
}
}
}, i);
}
for (int i = 0; i < vThreads.size(); ++i)
{
vThreads[i].join();
}
cout << "all thread done, time:" << (boost::posix_time::microsec_clock::local_time() - t0).total_milliseconds() << "\n";
cout << "testcudasvd out\n";
return 0;
}
// Program entry point: runs the multi-threaded SVD test once and uses its
// result as the process exit code.
int main()
{
    const int iExitCode = testCudaMthSvd();
    return iExitCode;
}