My program cannot run on GeForce RTX 3080 and 3070

My program hangs when run on a GeForce RTX 3080 or 3070, but it works correctly on a GeForce RTX 2080 or 2070. Here is the test code. Can anyone help me?

// Read exactly iLength bytes from the binary file sFileName into the buffer
// at pData.
// NOTE(review): pData is declared const but is written through a cast; the
// signature is kept unchanged for source compatibility with existing callers.
// Returns false when the file cannot be opened OR when fewer than iLength
// bytes were read (the original silently returned true on short reads,
// leaving the tail of the caller's buffer uninitialised).
static bool ReadBin(const void* pData, int64_t iLength, std::string sFileName)
{
	std::fstream of(sFileName.c_str(), std::ios::in | std::ios::binary);
	if (!of.is_open())
		return false;
	of.read((char*)pData, iLength);
	// gcount() reports how many bytes the last unformatted read delivered.
	bool bOk = (of.gcount() == iLength);
	of.close();
	return bOk;
}

struct stGpuHandles
{
	stGpuHandles()
	{
	}

	stGpuHandles(int index, cudaStream_t stream, cusolverDnHandle_t solver) : iIndex(index), hStream(stream), hCusolver(solver)
	{
	}

	int iIndex;
	cusolverDnHandle_t hCusolver;
	cudaStream_t hStream;
};

// MKL-style alias for the project's single-precision complex type.
using MKL_Complex8 = CFloat;

int testCudaMthSvd()
{
	cout << "testcudasvd in\n";
	const int im = 19832;
	const int in = 676;
	int ret = 0;
	shared_ptr<CFloat> spImgRaw(new CFloat[im * in]);
	if (!ReadBin(spImgRaw.get(), sizeof(CFloat) * im * in, "msepiDump_pIm2RowA.bin"))
	{
		cerr << " Read img data file msepiDump_pIm2RowA failed\n";
		return -1;
	}
	int iTasks = std::getenv("SVD_TASK_NUM") ? atoi(std::getenv("SVD_TASK_NUM")) : 40000;
	int iThreadNum = std::getenv("SVD_THREAD_NUM") ? atoi(std::getenv("SVD_THREAD_NUM")) : 20;
	int iHandles = std::getenv("SVD_HANDLE_NUM") ? atoi(std::getenv("SVD_HANDLE_NUM")) : 20;
	bool bDevide = std::getenv("SVD_DEVIDE") != 0;
	bool bBindThread = std::getenv("SVD_NOT_BIND") == 0;
	cout << "Param:" << "\n"
		<< setw(20) << "task:" << setw(10) << iTasks << "\n"
		<< setw(20) << "thread:" << setw(10) << iThreadNum << "\n"
		<< setw(20) << "handle:" << setw(10) << iHandles << "\n"
		<< setw(20) << "devide:" << setw(10) << bDevide << "\n"
		<< setw(20) << "bind:" << setw(10) << bBindThread << "\n";

	int iDevCount = 0;
	if ((ret = cudaGetDeviceCount(&iDevCount) != cudaSuccess))
	{
		cerr << "no gpu available\n";
	}
	map<int, stGpuHandles> mpThreadHandles;
	deque<stGpuHandles> handles;
	if (!bBindThread)
	{
		for (int i = 0; i < iHandles; ++i)
		{
			cusolverDnHandle_t handleCusolver;
			cudaStream_t hStream = 0;
			CHECKED_CALL_CUDA(cudaStreamCreate(&hStream));
			CHECKED_CALL_CUSOLVER(cusolverDnCreate(&handleCusolver));
			handles.push_back(stGpuHandles(i, hStream, handleCusolver));
		}
	}
	auto t0 = boost::posix_time::microsec_clock::local_time();
	boost::mutex mtxOut;
	boost::condition_variable cv;
	vector<boost::thread> vThreads(iThreadNum);
	for (int i = 0; i < iThreadNum; ++i)
	{
		vThreads[i] = boost::thread([=, &iTasks, &mtxOut, &cv, &handles, &mpThreadHandles](int ith)-> int
		{
			int iCurTask;
			while ((iCurTask = --iTasks) >= 0)
			{
				stGpuHandles handle;
				{
					boost::mutex::scoped_lock lc(mtxOut);
					if (bBindThread)
					{
						if (mpThreadHandles.find(ith) == mpThreadHandles.end())
						{
							cusolverDnHandle_t handleCusolver;
							cudaStream_t hStream = 0;
							CHECKED_CALL_CUDA(cudaStreamCreate(&hStream));
							CHECKED_CALL_CUSOLVER(cusolverDnCreate(&handleCusolver));
							mpThreadHandles[ith] = stGpuHandles(ith, hStream, handleCusolver);
						}
						handle = mpThreadHandles[ith];
					}
					else
					{
						while (handles.size() <= 0)
						{
							cv.wait(lc);
						}
						handle = *handles.begin();
						handles.pop_front();
					}
				}
				cudaStream_t hStream = handle.hStream;
				cusolverDnHandle_t hCusolver = handle.hCusolver;
				CHECKED_CALL_CUSOLVER(cusolverDnSetStream(hCusolver, hStream));
				shared_ptr<float> spS(new float[in]);
				shared_ptr<MKL_Complex8> spVt(new MKL_Complex8[in * in]);
				shared_ptr<MKL_Complex8> spU(new MKL_Complex8[im * in]);
				cuComplex* pdA = NULL;
				cuComplex* pdU = NULL;
				cuComplex* pdVt = NULL;
				cuComplex* pdWork = NULL;
				int iWork = 0;
				float* pdS = NULL;
				float* pdRwork = NULL;
				int* pdDevInfo = NULL;
				int devInfo = 0;
				gesvdjInfo_t gesvdj_params;

				if (bDevide)
				{
					CHECKED_CALL_CUSOLVER(cusolverDnCgesvd_bufferSize(hCusolver, im, in, &iWork));
				}
				else
				{
					CHECKED_CALL_CUSOLVER(cusolverDnCreateGesvdjInfo(&gesvdj_params));
					CHECKED_CALL_CUSOLVER(cusolverDnCgesvdj_bufferSize(hCusolver, CUSOLVER_EIG_MODE_VECTOR, 1, im, in, pdA, im, pdS, pdU, im, pdVt, in, &iWork,
 gesvdj_params));
				}
				CHECKED_CALL_CUDA(cudaMalloc(&pdA, sizeof(cuComplex)* im * in));
				CHECKED_CALL_CUDA(cudaMalloc(&pdU, sizeof(cuComplex)* im * in));
				CHECKED_CALL_CUDA(cudaMalloc(&pdVt, sizeof(cuComplex)* in * in));
				CHECKED_CALL_CUDA(cudaMalloc(&pdWork, sizeof(cuComplex)* iWork));
				CHECKED_CALL_CUDA(cudaMalloc(&pdS, sizeof(float)* in));
				CHECKED_CALL_CUDA(cudaMalloc(&pdRwork, sizeof(float)* std::min(im, in)));
				CHECKED_CALL_CUDA(cudaMalloc(&pdDevInfo, sizeof(int)));
				shared_ptr<void> pFake(0, [&](void*)
				{
					cudaFree(pdA);
					cudaFree(pdU);
					cudaFree(pdVt);
					cudaFree(pdWork);
					cudaFree(pdS);
					cudaFree(pdRwork);
					cudaFree(pdDevInfo);
					cusolverDnDestroyGesvdjInfo(gesvdj_params);
				});
				shared_ptr<CFloat> spImg(new CFloat[im * in]);
				memcpy(spImg.get(), spImgRaw.get(), sizeof(im * in * sizeof(CFloat)));
				auto t0 = boost::posix_time::microsec_clock::local_time();
				cudaMemcpyAsync(pdA, spImg.get(), sizeof(CFloat) * im * in, cudaMemcpyHostToDevice, hStream);
				if (bDevide)
				{
					CHECKED_CALL_CUSOLVER(cusolverDnCgesvd(hCusolver, 'S', 'S', im, in, pdA, im, pdS, pdU, im, pdVt, in, pdWork, iWork, pdRwork, pdDevInfo));
				}
				else
				{
					CHECKED_CALL_CUSOLVER(cusolverDnCgesvdj(hCusolver, CUSOLVER_EIG_MODE_VECTOR, 1, im, in, pdA, im, pdS, pdU, im, pdVt, in, pdWork, iWork,
 pdDevInfo, gesvdj_params));
				}
				CHECKED_CALL_CUDA(cudaMemcpyAsync(&devInfo, pdDevInfo, sizeof(devInfo), cudaMemcpyDeviceToHost, hStream));
				CHECKED_CALL_CUDA(cudaMemcpyAsync(spU.get(), pdU, sizeof(cuComplex)*im* in, cudaMemcpyDeviceToHost, hStream));
				CHECKED_CALL_CUDA(cudaMemcpyAsync(spS.get(), pdS, sizeof(float)* in, cudaMemcpyDeviceToHost, hStream));
				CHECKED_CALL_CUDA(cudaMemcpyAsync(spVt.get(), pdVt, sizeof(cuComplex)* in * in, cudaMemcpyDeviceToHost, hStream));
				cudaStreamSynchronize(hStream);
				if (devInfo < 0)
				{
					cerr << "param:" << devInfo << "error\n";
				}
				{
					boost::mutex::scoped_lock lc(mtxOut);
					uint64_t iGpuMemAlloc = sizeof(cuComplex) * im * in
						+ sizeof(cuComplex) * im * in
						+ sizeof(cuComplex) * in * in
						+ sizeof(cuComplex) * iWork
						+ sizeof(int)
						+ sizeof(float) * in
						+ sizeof(float) * std::min(im, in);
					if (!bBindThread)
					{
						handles.push_back(handle);
					}
					cv.notify_all();
					cout << "Thread:" << setw(2) << ith
						<< " use handle:" << setw(2) << handle.iIndex
						<< " ,run task:" << setw(2) << iCurTask
						<< " ,got value:" << setw(5) << *(spS.get())
						<< " ,time:" << (boost::posix_time::microsec_clock::local_time() - t0).total_milliseconds()
						<< " ,GpuMem:" << int(iGpuMemAlloc / (1024.f * 1024)) << "M,\n";
				}
			}
		}, i);
	}
	for (int i = 0; i < vThreads.size(); ++i)
	{
		vThreads[i].join();
	}
	cout << "all thread done, time:" << (boost::posix_time::microsec_clock::local_time() - t0).total_milliseconds() << "\n";
	cout << "testcudasvd out\n";
	return 0;
}

// Program entry point: run the multi-threaded SVD stress test and propagate
// its status code to the shell.
int main()
{
	const int iStatus = testCudaMthSvd();
	return iStatus;
}

My program cannot run on a GeForce RTX 3080 or 3070; it hangs.

You might want to mention where in the code the program hangs. You might also want to mention how you compile the code (i.e. complete nvcc commandline used to build the binary). Also worth mentioning: the CUDA version that is being used here.

The test code runs 400000 cycles; you can think of it as a stability test. It hangs after several or a few dozen cycles, and you cannot predict which cycle it will hang on. My nvcc command line: nvcc ../nv_svd.cpp -l:libboost_system.so.1.60.0 -l:libboost_date_time.so.1.60.0 -l:libboost_thread.so.1.60.0 -lcudart -lcublas -lcusolver -L. -I../include -std=c++11 -Xcompiler -fopenmp -O2 -arch=sm_86 (for the RTX 2080 it is sm_75) -o nv_svd

I see I expressed myself poorly. Let me try again:

At which line of the program code does the hang occur?