I have a piece of code that, in theory, performs an axpy using C++ stdpar. (I recognize that a sane person would not write it this way; I'm reproducing an issue I saw in an existing codebase I'm porting.)
#include <algorithm>
#include <cassert>
#include <execution>
#include <functional>
#include <ranges>
#include <vector>

struct data {
    int n;
    double *x, *y, *z;
};

void do_stuff(data* arrays) {
    auto groups = std::ranges::views::iota(0, arrays->n);
    std::for_each(std::execution::par_unseq, groups.begin(), groups.end(), [=](int g) {
        arrays->z[g] = arrays->x[g] + arrays->y[g];
    });
}

int main(int argc, char* argv[]) {
    using std::execution::par_unseq;
    using std::vector;

    data arrays;
    arrays.n = 100'000'000;
    arrays.x = new double[arrays.n];
    arrays.y = new double[arrays.n];
    arrays.z = new double[arrays.n];

    // Initialize x and y
    for(int i = 0; i < arrays.n; i++) {
        arrays.x[i] = i;
        arrays.y[i] = arrays.n - i;
    }

    // Compute sum of x and y
    //std::transform(par_unseq, x.cbegin(), x.cend(), y.cbegin(),
    //               z.begin(), std::plus<>{});
    do_stuff(&arrays);

    // Assert that the sum is correct
    for(int i = 0; i < arrays.n; i++) {
        assert(arrays.z[i] == arrays.n);
    }

    delete[] arrays.x;
    delete[] arrays.y;
    delete[] arrays.z;
    return 0;
}
I compile with nvc++ -stdpar -Minfo -std=c++20 main.cpp -o axpy, and the code crashes at runtime with:
CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x155527836140
Thread 1 "axpy" received signal CUDA_EXCEPTION_14, Warp Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (128,0,0), device 0, sm 0, warp 4, lane 0]
0x0000155527836180 in void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<std::ranges::iota_view<int, int>::_Iterator, thrust::detail::wrapped_function<do_stuff(data*)::{lambda(int)#1}, void> >, decltype ((std::ranges::__detail::__to_signed_like<int>)((declval<int>)()))>, thrust::cuda_cub::for_each_f<std::ranges::iota_view<int, int>::_Iterator, thrust::detail::wrapped_function<do_stuff(data*)::{lambda(int)#1}, void> >, decltype ((std::ranges::__detail::__to_signed_like<int>)((declval<int>)()))>(thrust::cuda_cub::for_each_f<std::ranges::iota_view<int, int>::_Iterator, thrust::detail::wrapped_function<do_stuff(data*)::{lambda(int)#1}, void> >, decltype ((std::ranges::__detail::__to_signed_like<int>)((declval<int>)())))<<<(195313,1,1),(256,1,1)>>> () at /home/klinalic/gpu-programming-models/axpy/main.cpp:16 in do_stuff(data*)::{lambda(int)#1}::operator() inlined from function.h:125
16 arrays->z[g] = arrays->x[g] + arrays->y[g];
Interestingly, if I change data arrays; to data* arrays = new data; (and propagate that change through the rest of the code), the code works fine. Is this because the second version allocates the struct on the heap? Is this intended behavior?
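For reference, here is roughly what I mean by "propagate that change": only main changes; the struct definition and do_stuff stay exactly as above, and only the allocation of the data object itself moves to the heap.

int main(int argc, char* argv[]) {
    // Allocate the struct itself on the heap instead of the stack
    data* arrays = new data;
    arrays->n = 100'000'000;
    arrays->x = new double[arrays->n];
    arrays->y = new double[arrays->n];
    arrays->z = new double[arrays->n];

    // Initialize x and y
    for(int i = 0; i < arrays->n; i++) {
        arrays->x[i] = i;
        arrays->y[i] = arrays->n - i;
    }

    do_stuff(arrays);  // pass the pointer directly

    // Assert that the sum is correct
    for(int i = 0; i < arrays->n; i++) {
        assert(arrays->z[i] == arrays->n);
    }

    delete[] arrays->x;
    delete[] arrays->y;
    delete[] arrays->z;
    delete arrays;
    return 0;
}

With this version, the same nvc++ -stdpar build runs to completion without the Warp Illegal Address exception.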