Stdpar -- Floating point exception

If I compile the benchmark below (from the C++ Team Blog post "Using C++17 Parallel Algorithms for Better Performance")
with `nvc++ -stdpar main.cpp -o main`, I get a floating point exception when I run it (see the gdb error and backtrace below).
If I compile with debug symbols (`nvc++ -stdpar -g main.cpp -o main`), I don't get the exception and the program runs. I have gcc 10.1.0, CUDA 10.2.89, kernel 5.7.4, and 2080 SUPER GPUs.

#include <stddef.h>
#include <stdio.h>
#include <algorithm>
#include <chrono>
#include <execution>
#include <random>
#include <ratio>
#include <vector>

using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::milli;
using std::random_device;
using std::sort;
using std::vector;

// Number of doubles generated and sorted on every iteration.
// (Digit separators must be plain apostrophes, not typographic quotes.)
const size_t testSize = 1'000'000;
// Number of timed sort runs performed by main().
const int iterationCount = 20;

// Prints one result line for a timed run.
//
// tag        label printed at the start of the line
// sorted     the data after sorting; its first and last elements are
//            reported as "Lowest" and "Highest". Must be non-empty, because
//            front()/back() are read unconditionally.
// startTime  time point taken just before the sort
// endTime    time point taken just after the sort
//
// The elapsed time is converted to fractional milliseconds before printing.
void print_results(const char *const tag, const std::vector<double>& sorted,
                   std::chrono::high_resolution_clock::time_point startTime,
                   std::chrono::high_resolution_clock::time_point endTime) {
    printf("%s: Lowest: %g Highest: %g Time: %fms\n", tag, sorted.front(),
           sorted.back(),
           std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(
               endTime - startTime).count());
}

// Benchmark driver: fills a vector with testSize random doubles, then
// iterationCount times copies it, sorts the copy with the parallel execution
// policy, and prints the first/last element and the elapsed time.
int main() {
    std::random_device rd;

    // generate some random doubles:
    printf("Testing with %zu doubles...\n", testSize);
    std::vector<double> doubles(testSize);
    for (auto& d : doubles) {
        d = static_cast<double>(rd());
    }

    // time how long it takes to sort them:
    for (int i = 0; i < iterationCount; ++i)
    {
        // Sort a fresh copy each time so every run sees the same unsorted input;
        // the copy is made before the clock starts and is not part of the timing.
        std::vector<double> sorted(doubles);
        const auto startTime = std::chrono::high_resolution_clock::now();
        // This is the call that raises SIGFPE in the report (frame #14 of the
        // backtrace is std::sort with parallel_policy).
        std::sort(std::execution::par, sorted.begin(), sorted.end());
        const auto endTime = std::chrono::high_resolution_clock::now();
        // Labelled "Parallel" because the sort above uses std::execution::par.
        print_results("Parallel", sorted, startTime, endTime);
    }

}

Exception:
Thread 1 “main” received signal SIGFPE, Arithmetic exception.
cub::GridEvenShare::DispatchInit ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/../../agent/../grid/grid_even_share.cuh:134
134 OffsetT avg_tiles_per_block = total_tiles / grid_size;

Backtrace:
#0 cub::GridEvenShare::DispatchInit () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/../../agent/../grid/grid_even_share.cuh:134
#1 cub::DispatchRadixSort<false, double, cub::NullType, int, cub::DeviceRadixSortPolicy<double, cub::NullType, int> >::PassConfig<void ()(double const, int*, int, int, int, cub::GridEvenShare), void ()(int, int), void ()(double const, double*, cub::NullType const*, cub::NullType*, int*, int, int, int, cub::GridEvenShare)>::InitPassConfig<cub::AgentRadixSortUpsweepPolicy<256, 23, double, (cub::CacheLoadModifier)0, 7, cub::RegBoundScaling<256, 23, double> >, cub::AgentScanPolicy<512, 23, int, (cub::BlockLoadAlgorithm)3, (cub::CacheLoadModifier)0, (cub::BlockStoreAlgorithm)3, (cub::BlockScanAlgorithm)1, cub::MemBoundScaling<512, 23, int> >, cub::AgentRadixSortDownsweepPolicy<512, 23, double, (cub::BlockLoadAlgorithm)2, (cub::CacheLoadModifier)0, (cub::RadixRankAlgorithm)2, (cub::BlockScanAlgorithm)2, 7, cub::RegBoundScaling<512, 23, double> > > ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/dispatch_radix_sort.cuh:1104
#2 cub::DispatchRadixSort<false, double, cub::NullType, int, cub::DeviceRadixSortPolicy<double, cub::NullType, int> >::InvokePasses<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, void ()(double const, int*, int, int, int, cub::GridEvenShare), void ()(int, int), void ()(double const, double*, cub::NullType const*, cub::NullType*, int*, int, int, int, cub::GridEvenShare)> () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/dispatch_radix_sort.cuh:1155
#3 cub::DispatchRadixSort<false, double, cub::NullType, int, cub::DeviceRadixSortPolicy<double, cub::NullType, int> >::Invoke<cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700> () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/dispatch_radix_sort.cuh:1265
#4 cub::ChainedPolicy<700, cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy700, cub::DeviceRadixSortPolicy<double, cub::NullType, int>::Policy620>::Invoke<cub::DispatchRadixSort<false, double, cub::NullType, int, cub::DeviceRadixSortPolicy<double, cub::NullType, int> > > ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/block/../iterator/../util_device.cuh:682
#5 cub::DispatchRadixSort<false, double, cub::NullType, int, cub::DeviceRadixSortPolicy<double, cub::NullType, int> >::Dispatch ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/dispatch/dispatch_radix_sort.cuh:1311
#6 cub::DeviceRadixSort::SortKeys () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/cub/device/device_radix_sort.cuh:611
#7 0x000000000040ef8a in thrust::cuda_cub::__radix_sort::dispatch<thrust::detail::integral_constant<bool, false>, thrust::less >::doit<double, double, long> ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/system/cuda/detail/sort.h:1339
#8 ZN6thrust8cuda_cub12__radix_sort10radix_sortINS_6detail17integral_constantIbLb0EEENS3_22execute_with_allocatorINS_2mr9allocatorIcNS7_37disjoint_unsynchronized_pool_resourceINS_6system4cuda6detail20cuda_memory_resourceIXadL10cudaMallocEEXadL8cudaFreeEENS0_7pointerIvEEEENS7_19new_delete_resourceEEEEENS0_22execute_on_stream_baseEEEddlNS_4lessIdEEEEvRNS0_16execution_policyIT0_EEPT1_PT2_T3_T4 () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/system/cuda/detail/sort.h:1452
#9 ZN6thrust8cuda_cub12__smart_sort10smart_sortINS_6detail17integral_constantIbLb0EEES5_NS3_22execute_with_allocatorINS_2mr9allocatorIcNS7_37disjoint_unsynchronized_pool_resourceINS_6system4cuda6detail20cuda_memory_resourceIXadL10cudaMallocEEXadL8cudaFreeEENS0_7pointerIvEEEENS7_19new_delete_resourceEEEEENS0_22execute_on_stream_baseEEEN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEESO_NS_4lessIdEEEENS1_24enable_if_primitive_sortIT2_T4_E4typeERNS0_16execution_policyIT1_EESW_SW_T3_SX ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/system/cuda/detail/sort.h:1588
#10 ZN6thrust8cuda_cub4sortINS_6detail22execute_with_allocatorINS_2mr9allocatorIcNS4_37disjoint_unsynchronized_pool_resourceINS_6system4cuda6detail20cuda_memory_resourceIXadL10cudaMallocEEXadL8cudaFreeEENS0_7pointerIvEEEENS4_19new_delete_resourceEEEEENS0_22execute_on_stream_baseEEEN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEENS_4lessIdEEEEvRNS0_16execution_policyIT_EET0_SW_T1 () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/system/cuda/detail/sort.h:1621
#11 0x000000000040395a in ZN6thrust8cuda_cub4sortINS_6detail22execute_with_allocatorINS_2mr9allocatorIcNS4_37disjoint_unsynchronized_pool_resourceINS_6system4cuda6detail20cuda_memory_resourceIXadL10cudaMallocEEXadL8cudaFreeEENS0_7pointerIvEEEENS4_19new_delete_resourceEEEEENS0_22execute_on_stream_baseEEEN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEEEvRNS0_16execution_policyIT_EET0_SU () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/system/cuda/detail/sort.h:1712
#12 ZN6thrust4sortINS_6detail22execute_with_allocatorINS_2mr9allocatorIcNS3_37disjoint_unsynchronized_pool_resourceINS_6system4cuda6detail20cuda_memory_resourceIXadL10cudaMallocEEXadL8cudaFreeEENS_8cuda_cub7pointerIvEEEENS3_19new_delete_resourceEEEEENSA_22execute_on_stream_baseEEEN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEEEvRKNS1_21execution_policy_baseIT_EET0_SV () at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include-stdpar/thrust/detail/sort.inl:41
#13 std::__pstl::__algorithm_wrapper_struct::sort<__gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocator<double> > > > ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include/nvhpc/algorithm_execution.hpp:1790
#14 0x00000000004037b1 in std::sort<std::execution::parallel_policy&, __gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocator<double> > > > ()
at /opt/nvidia/hpc_sdk/Linux_x86_64/20.5/compilers/include/nvhpc/algorithm_execution.hpp:2831

This looks like a problem with the compiler. I have logged the issue and we’ll have a compiler person look at it. Thanks for the report.

Hi Axel,

This was addressed in the NVIDIA HPC SDK version 20.9, and should be working now. Thanks for the report.