OpenCV CUDA runtime issue

I built OpenCV 3.4.20-dev with CUDA 11.8 and the OpenCV extra modules (opencv_contrib). I configured the build with CMake and it went fine. I can use the library, but I cannot use cuda::HOG; it fails with the error below:

OpenCV(3.4.20-dev) Error: Gpu API call (invalid device symbol) in cv::cuda::device::hog::set_up_constants, file C:\Users\Hiran\Documents\GitHub\opencv\modules\cudaobjdetect\src\cuda\hog.cu, line 107
OpenCV(3.4.20-dev) C:\Users\Hiran\Documents\GitHub\opencv\modules\cudaobjdetect\src\cuda\hog.cu:107: error: (-217:Gpu API call) invalid device symbol in function 'cv::cuda::device::hog::set_up_constants'

I set CUDA_ARCH_BIN to 8.6 because my GPU is an RTX 3070. I also tried building with CUDA 11.6, but I get the same error.

This is the code from hog.cu that the error points to:

// Uploads the HOG layout parameters, plus four sizes derived from them, to the
// device symbols cnbins ... cdescr_size (declared elsewhere in this file).
//
// Every upload is a host-to-device cudaMemcpyToSymbolAsync issued on `stream`
// and wrapped in cudaSafeCall, so a failing CUDA runtime call is reported
// through that macro.
//
// Parameters:
//   nbins                          - value copied to cnbins
//   block_stride_x, block_stride_y - values copied to cblock_stride_x / _y
//   nblocks_win_x, nblocks_win_y   - values copied to cnblocks_win_x / _y
//   ncells_block_x, ncells_block_y - values copied to cncells_block_x / _y
//   stream                         - CUDA stream on which all copies are enqueued
void set_up_constants(int nbins,
                              int block_stride_x, int block_stride_y,
                              int nblocks_win_x, int nblocks_win_y,
                              int ncells_block_x, int ncells_block_y,
                              const cudaStream_t& stream)
        {
            // Sizes derived from the arguments; computed up front so the
            // uploads below read as one uniform list.
            const int hist_len      = nbins * ncells_block_x * ncells_block_y;
            // power_2up is defined elsewhere; presumably it rounds up to the
            // next power of two -- confirm against its definition.
            const int hist_len_pow2 = power_2up(hist_len);
            const int row_len       = nblocks_win_x * hist_len;
            const int descr_len     = row_len * nblocks_win_y;

            // Parameters passed in by the caller.
            cudaSafeCall(cudaMemcpyToSymbolAsync(cnbins, &nbins, sizeof(nbins),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_x, &block_stride_x, sizeof(block_stride_x),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_stride_y, &block_stride_y, sizeof(block_stride_y),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_x, &ncells_block_x, sizeof(ncells_block_x),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cncells_block_y, &ncells_block_y, sizeof(ncells_block_y),
                                                 0, cudaMemcpyHostToDevice, stream));

            // Derived sizes.
            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size, &hist_len, sizeof(hist_len),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cblock_hist_size_2up, &hist_len_pow2, sizeof(hist_len_pow2),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_width, &row_len, sizeof(row_len),
                                                 0, cudaMemcpyHostToDevice, stream));
            cudaSafeCall(cudaMemcpyToSymbolAsync(cdescr_size, &descr_len, sizeof(descr_len),
                                                 0, cudaMemcpyHostToDevice, stream));
        }