GPU: RTX 5070 Ti (laptop).
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:42:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
/cuda_sort_project
│-- CMakeLists.txt
│-- main.cu
CMakeLists.txt:
# CMAKE_CUDA_ARCHITECTURES (policy CMP0104) is only available from CMake 3.18.
cmake_minimum_required(VERSION 3.18)

# Build device code for the GPU the program actually runs on instead of
# relying on nvcc's default target. 120 corresponds to compute capability 12.0
# (RTX 50 series); override with -DCMAKE_CUDA_ARCHITECTURES=<arch> for other
# GPUs. Must be set before project() so it also applies to compiler detection.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES 120)
endif()

project(SimpleCudaProject CUDA)

# Require C++17 for CUDA sources rather than silently falling back to an
# older standard if the compiler does not support it.
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

add_executable(SimpleCudaProject main.cu)

# Relocatable device code (-rdc=true) stays enabled, as the project requires.
set_target_properties(SimpleCudaProject PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
main.cu:
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cub/cub.cuh>
// Fills a host buffer with n pseudo-random floats in [0, 1], sorts the keys on
// the GPU with cub::DeviceRadixSort::SortKeys, and copies the result back.
// Every CUDA/CUB call is checked; the process exits with EXIT_FAILURE and a
// message naming the failing call as soon as one of them reports an error.
int main() {
    // Fail fast on the first error: ignoring a cudaError_t lets later calls
    // fail (or silently produce garbage) far away from the real cause.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    const int n = 10000; // 1000 is ok
    const size_t bytes = n * sizeof(float);

    // Host input: uniformly distributed values from rand().
    std::vector<float> h_data(n);
    for (int i = 0; i < n; ++i) {
        h_data[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    std::cout << "Unsorted data: ";
    // Element dump is disabled because n is large; uncomment to inspect.
    // for (int i = 0; i < n; ++i) std::cout << h_data[i] << " ";
    std::cout << std::endl;

    // Device input and output buffers.
    float* d_data = nullptr;
    float* d_sorted = nullptr;
    check(cudaMalloc(&d_data, bytes), "cudaMalloc(d_data)");
    check(cudaMalloc(&d_sorted, bytes), "cudaMalloc(d_sorted)");
    check(cudaMemcpy(d_data, h_data.data(), bytes, cudaMemcpyHostToDevice),
          "cudaMemcpy(host -> device)");

    // First call with a null temp-storage pointer: no sorting is done, the
    // call only writes the required scratch size into temp_storage_bytes.
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    check(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                         d_data, d_sorted, n),
          "cub::DeviceRadixSort::SortKeys (temp storage size query)");
    check(cudaMalloc(&d_temp_storage, temp_storage_bytes), "cudaMalloc(d_temp_storage)");

    /*
    Reported failure of the call below for n = 10000 when the project is built
    with CUDA_SEPARABLE_COMPILATION ON:
    CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/
    dispatch_radix_sort.cuh, 2287]: invalid argument
    CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/
    dispatch_radix_sort.cuh, 2706]: invalid argument
    */
    // Second call with real temp storage performs the sort.
    check(cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                         d_data, d_sorted, n),
          "cub::DeviceRadixSort::SortKeys (sort)");

    // Errors raised while the sort kernels execute only surface at the next
    // synchronizing call, so synchronize before trusting d_sorted.
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize (after sort)");
    check(cudaFree(d_temp_storage), "cudaFree(d_temp_storage)");

    std::vector<float> h_sorted(n);
    check(cudaMemcpy(h_sorted.data(), d_sorted, bytes, cudaMemcpyDeviceToHost),
          "cudaMemcpy(device -> host)");

    std::cout << "Sorted data: ";
    // Element dump is disabled because n is large; uncomment to inspect.
    // for (int i = 0; i < n; ++i) std::cout << h_sorted[i] << " ";
    std::cout << std::endl;

    check(cudaFree(d_sorted), "cudaFree(d_sorted)");
    check(cudaFree(d_data), "cudaFree(d_data)");
    return 0;
}
Why does n = 1000 work fine, while n = 10000 produces the following error?
CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/dispatch_radix_sort.cuh, 2287]: invalid argument
CUDA error 1 [C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\include\cub/device/dispatch/dispatch_radix_sort.cuh, 2706]: invalid argument
I found that enabling CUDA_SEPARABLE_COMPILATION triggers this error, but I don't want to disable this flag. How can I fix it?