I am using the NVIDIA HPC SDK (2022) to compile the following code, whose basic purpose is to sum each row of an N×M matrix, producing a vector of length N.
#include <cstddef>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
// Matrix dimensions: N rows, each holding M contiguous elements.
constexpr unsigned int N = 2048;
constexpr unsigned int M = 2048;
int main(int argc, char* argv[]) {
thrust::device_vector<double> g_vec1(N*M);
thrust::device_vector<double> g_vec2(N);
thrust::fill(thrust::device, g_vec1.begin(),g_vec1.end(),1.);
thrust::device_vector<thrust::
device_vector<double>::iterator> g_it_vec(N);
for (int i=0; i<N; i++)
g_it_vec[i] = g_vec1.begin() + i*M;
thrust::transform(g_it_vec.begin(),g_it_vec.end(),g_vec2.begin(),
[](const auto& it) {
return thrust::reduce(thrust::device,
it, it+M,0.);});
}
When I run this code on a 3080 Ti GPU, an error occurs when M > 2048 (or when M > 1024 if I use complex numbers):
temporary_buffer::allocate: get_temporary_buffer failed
…
temporary_buffer::allocate: get_temporary_buffer failed
terminate called after throwing an instance of ‘thrust::system::system_error’
what(): transform: failed to synchronize: cudaErrorLaunchFailure: unspecified launch failure
Aborted (core dumped)
Why does this happen? Is it related to the maximum of 1024 threads per block?
Or is there a standard way to reduce a matrix (2D array)?