I’m using CUDA 10.2 for a project (sm_61, compute_61). Here is part of my code.

```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
// Threads per block: used as the block size of the kernel launches and as the
// divisor in Block_Count.
constexpr unsigned int CUDA_MAX_THREAD_NUM = 256;
// Ceiling division: number of CUDA_MAX_THREAD_NUM-sized blocks needed so that
// the launched threads cover at least tot_thrd elements (0 when tot_thrd is 0).
#define Block_Count(tot_thrd) (((tot_thrd) + CUDA_MAX_THREAD_NUM - 1u) / CUDA_MAX_THREAD_NUM)
// Reports a failed CUDA runtime call and terminates the process.
//   err  - status code returned by the CUDA runtime call
//   file - source file of the call site
//   line - source line of the call site
// Does nothing when err is cudaSuccess; otherwise prints the numeric code, its
// description and the call site to stderr, then exits with status 3.
static void HandleError(cudaError_t err, const char *file, int line) {
    if (err == cudaSuccess) {
        return;
    }
    const char *description = cudaGetErrorString(err);
    fprintf(stderr, "Error %d: \"%s\" in %s at line %d\n", static_cast<int>(err), description, file, line);
    exit(3);
}
// Checks a CUDA status expression, tagging any failure with the file and line
// where the macro is expanded.
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// Kernel: default-constructs one T in place in each of the Len slots of dsts.
//   dsts - storage for Len objects of type T, accessible from the device
//   Len  - number of objects to construct
// Launch layout: 1-D grid of 1-D blocks providing at least Len threads in
// total; each thread handles one slot and surplus threads return immediately.
template <typename T>
__global__ void subConstructArray(T* dsts, size_t Len) {
    // Widen to size_t before multiplying: the product of two 32-bit values
    // would otherwise wrap for very large grids. blockDim.x keeps the index
    // correct for whatever block size the kernel was launched with.
    const size_t Idi = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (Idi >= Len) return;
    new (dsts + Idi) T;
}
// Kernel: runs the destructor of each of the Len objects stored in dsts.
//   dsts - array of Len constructed objects of type T, accessible from the device
//   Len  - number of objects to destroy
// Launch layout: 1-D grid of 1-D blocks providing at least Len threads in
// total; each thread handles one object and surplus threads return immediately.
// The storage itself is not released here.
template <typename T>
__global__ void subDestructArray(T* dsts, size_t Len) {
    // Widen to size_t before multiplying: the product of two 32-bit values
    // would otherwise wrap for very large grids. blockDim.x keeps the index
    // correct for whatever block size the kernel was launched with.
    const size_t Idi = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (Idi >= Len) return;
    (dsts + Idi)->~T();
}
// Allocates device storage for Len objects of type T and default-constructs
// them on the device.
//   dsts - receives the device pointer (set to nullptr when Len is 0)
//   Len  - number of objects to allocate and construct
// Any CUDA failure is reported through HANDLE_ERROR, which terminates the
// program. Blocks until construction has finished.
template <typename T>
void ConstructArray(T*& dsts, size_t Len) {
    if (Len == 0) {
        // A grid with zero blocks is an invalid launch configuration, and
        // there is nothing to construct anyway.
        dsts = nullptr;
        return;
    }
    HANDLE_ERROR(cudaMalloc(&dsts, sizeof(T) * Len));
    const unsigned int blocks = static_cast<unsigned int>(Block_Count(Len));
    subConstructArray<<<blocks, CUDA_MAX_THREAD_NUM>>>(dsts, Len);
    // Launch-time failures (bad configuration, out of resources) are visible
    // immediately after the launch.
    HANDLE_ERROR(cudaGetLastError());
    // Failures that occur while the kernel runs are returned by the sync.
    HANDLE_ERROR(cudaDeviceSynchronize());
}
// Destroys the Len objects stored in the device array dsts, then frees the
// array.
//   dsts - device pointer previously filled by ConstructArray (may be nullptr)
//   Len  - number of constructed objects in dsts
// Any CUDA failure is reported through HANDLE_ERROR, which terminates the
// program. Blocks until destruction has finished.
template <typename T>
void DestructArray(T* dsts, size_t Len) {
    // Skip the kernel when there is nothing to destroy: a null array would be
    // dereferenced on the device and a zero-block grid is an invalid launch.
    if (dsts != nullptr && Len != 0) {
        const unsigned int blocks = static_cast<unsigned int>(Block_Count(Len));
        subDestructArray<<<blocks, CUDA_MAX_THREAD_NUM>>>(dsts, Len);
        // Launch-time failures such as cudaErrorLaunchOutOfResources are
        // reported here, directly after the launch.
        HANDLE_ERROR(cudaGetLastError());
        // Failures that occur while the destructors run are returned by the sync.
        HANDLE_ERROR(cudaDeviceSynchronize());
    }
    // cudaFree accepts a null pointer and performs no operation for it.
    HANDLE_ERROR(cudaFree(dsts));
}
// Device-only 3-D container (ply x height x width) whose element buffer is
// obtained with the in-kernel malloc and released with the in-kernel free.
// Elements are raw storage: no T constructors or destructors are run on them.
//
// NOTE(review): no copy constructor or copy assignment is declared, so the
// implicit ones copy the raw pointer and both copies would free the same
// buffer. Do not copy instances.
template<typename T>
class tensor {
private:
    T* _elems;    // element buffer, nullptr while the tensor is empty
    int _Ply;     // extent of the first dimension
    int _Height;  // extent of the second dimension
    int _Width;   // extent of the third dimension
public:
    // Creates an empty tensor (no buffer, all extents 0).
    __device__ tensor() : _elems(nullptr), _Ply(0), _Height(0), _Width(0) {}

    // Releases the element buffer; free(nullptr) is a no-op.
    __device__ ~tensor() {
        free(_elems);
        _elems = nullptr;
    }

    // Resizes the tensor to _ply x _hgt x _wid.
    // The buffer is reallocated only when the total element count changes;
    // previous contents are discarded in that case and kept otherwise.
    // If the allocation fails, the tensor is left empty (null buffer, all
    // extents 0) so that the stored extents never describe a missing buffer.
    __device__ void Set_size(int _ply, int _hgt, int _wid) {
        // 64-bit products: the int product overflows for realistic sizes.
        const long long new_count = static_cast<long long>(_ply) * _hgt * _wid;
        const long long old_count = static_cast<long long>(_Ply) * _Height * _Width;
        if (new_count != old_count) {
            free(_elems);
            _elems = nullptr;
            if (new_count > 0) {
                _elems = static_cast<T*>(malloc(sizeof(T) * static_cast<size_t>(new_count)));
                if (_elems == nullptr) {
                    // In-kernel malloc returns NULL when the device heap
                    // cannot satisfy the request.
                    _Ply = 0;
                    _Height = 0;
                    _Width = 0;
                    return;
                }
            }
        }
        _Ply = _ply;
        _Height = _hgt;
        _Width = _wid;
    }
};
// Number of tensor objects currently held in each of the `out` and `er`
// arrays; updated by Set_Tot_Thrd.
size_t Tot_Thrd = 256;
// Device array of Tot_Thrd tensors, built by ConstructArray.
// NOTE(review): presumably one output tensor per thread — confirm.
tensor<double>* out;
// Device array of Tot_Thrd tensors, built by ConstructArray.
// NOTE(review): presumably one error tensor per thread — confirm.
tensor<double>* er;
// Rebuilds the global `out` and `er` device arrays with _num_of_thrds tensors
// each. The existing arrays (Tot_Thrd tensors each) are destroyed and freed
// first, so their contents are not carried over. Tot_Thrd is updated last.
void Set_Tot_Thrd(size_t _num_of_thrds) {
    // Both arrays are handled identically; keep the order out, then er.
    tensor<double>** const arrays[] = { &out, &er };

    for (tensor<double>** slot : arrays) {
        DestructArray(*slot, Tot_Thrd);
    }
    for (tensor<double>** slot : arrays) {
        ConstructArray(*slot, _num_of_thrds);
    }

    Tot_Thrd = _num_of_thrds;
}
// Builds the two global tensor arrays with the initial count, resizes them to
// 16 tensors each, then tears them down before exiting.
int main() {
    ConstructArray(out, Tot_Thrd);
    ConstructArray(er, Tot_Thrd);
    Set_Tot_Thrd(16);

    // Run the tensor destructors and free the device arrays instead of
    // leaving them allocated at exit.
    DestructArray(out, Tot_Thrd);
    DestructArray(er, Tot_Thrd);
    out = nullptr;
    er = nullptr;
    return 0;
}
```

But when I call the function `Set_Tot_Thrd(8)`, `HANDLE_ERROR(cudaGetLastError())` in the host function `DestructArray()` reports cudaErrorLaunchOutOfResources (701) – “too many resources requested for launch”. At first I thought that the kernel `__global__ void subDestructArray` needs more registers than are available per block, so I ran deviceQuery:

And here’s ptxas info:

256 * 17 = 4352 < 65536, so the number of registers does not exceed the limit.

I also added -maxrregcount=16, but after recompiling, the same error “cudaErrorLaunchOutOfResources” occurred when running the program. Neither cuda-memcheck nor ptxas gave me a helpful answer.

It seems that cudaErrorLaunchOutOfResources can mean that some resource other than registers has been exceeded, but I have no idea what runs out when I call the function.

I’m desperate to solve this problem. What should I do? Many thanks in advance.