Why does sorting an array with the CUB library fail in CUDA on Windows 11?

GPU: RTX 5070 Ti (laptop).
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:42:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0

/cuda_sort_project
│-- CMakeLists.txt
│-- main.cu

CMakeLists.txt:

# Build script for the single-file CUB radix-sort example (main.cu).
# CMake 3.18 is the first release that accepts 17 as a CUDA standard value,
# so the minimum version is raised to match the standard requested below.
cmake_minimum_required(VERSION 3.18)
project(SimpleCudaProject CUDA)
# Request C++17 for CUDA sources and make CMake report an error, rather than
# silently falling back to an older standard, if the compiler cannot provide it.
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
add_executable(SimpleCudaProject main.cu)
# Enable relocatable device code (separate compilation + device link step).
# NOTE(review): no CUDA architecture is selected explicitly, so the build
# targets the toolkit's default architecture rather than the installed GPU.
# Consider configuring with -DCMAKE_CUDA_ARCHITECTURES=<compute capability>;
# whether this influences the reported CUB error has not been confirmed.
set_target_properties(SimpleCudaProject PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

main.cu:

#include <cstdlib>
#include <iostream>
#include <vector>

#include <cub/cub.cuh>

int main() {
    const int n = 10000; // 1000 is ok
    size_t size = n * sizeof(float);
    std::vector<float> h_data(n);
    
    for (int i = 0; i < n; ++i) {
        h_data[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    std::cout << "Unsorted data: ";
    for (int i = 0; i < n; ++i) {
    	//std::cout << h_data[i] << " ";
    }
    std::cout << std::endl;
    float* d_data;
    cudaMalloc(&d_data, n * sizeof(float));
    cudaMemcpy(d_data, h_data.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    
    float* d_sorted;
    cudaMalloc(&d_sorted, n * sizeof(float));
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    
    cub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes,
        d_data, d_sorted, n
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    /*
      CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/
      dispatch_radix_sort.cuh, 2287]: invalid argument
      CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/
      dispatch_radix_sort.cuh, 2706]: invalid argument
     */
    cub::DeviceRadixSort::SortKeys(
        d_temp_storage, temp_storage_bytes,
        d_data, d_sorted, n
    );
    cudaFree(d_temp_storage);
    
    std::vector<float> h_sorted(n);
    cudaMemcpy(h_sorted.data(), d_sorted, n * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "Sorted data: ";
    for (int i = 0; i < n; ++i) {
    	//std::cout << h_sorted[i] << " ";
    }
    std::cout << std::endl;
    cudaFree(d_sorted);
    cudaFree(d_data);
    
    return 0;
}

Why does n = 1000 work, while n = 10000 gives this error:
CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/dispatch_radix_sort.cuh, 2287]: invalid argument
CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/dispatch_radix_sort.cuh, 2706]: invalid argument

I found that enabling CUDA_SEPARABLE_COMPILATION causes this error, but I don’t want to disable this flag. How can I fix it?

Not really clear:

Anyway, with CUDA 12.8.1 I was not able to reproduce the error, either when n = 1000 or when n = 10000, although I don’t have a 5070ti to test on. So I’m not really sure what you are seeing. I tried both with and without -rdc=true

I tried with an RTX 4090 and the error is the same: n = 1000 works, but n = 10000 fails.