I have somehow managed to make my code compiled using CUDA as Thrust’s backend. However, I kept running into seg-fault by running the application. The error looks roughly like this:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): device free failed: an illegal memory access was encountered
Further performing cuda-memcheck for the application looks something horribly wrong:
========= Invalid __global__ read of size 8
========= at 0x000007e8 in void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::binary_transform_f<thrust::detail::normal_iterator<thrust::device_ptr<thrust::device_ptr<Geometry::Point<long=2>>>>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<Centroid<int=2>>>, thrust::cuda_cub::__transform::no_stencil_tag, VolfracEvaluator<int=2>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::binary_transform_f<thrust::detail::normal_iterator<thrust::device_ptr<thrust::device_ptr<Geometry::Point<long=2>>>>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<Centroid<int=2>>>, thrust::cuda_cub::__transform::no_stencil_tag, VolumeEvaluator<int=2>, thrust::cuda_cub::__transform::always_true_predicate>, long>(Geometry::Point<long=2>, thrust::device_ptr<Geometry::Point<long=2>>)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x5572d3abf8d0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x22b40d]
========= Host Frame:./volume-cuda [0x2f36b]
========= Host Frame:./volume-cuda [0x4c5be]
========= Host Frame:./volume-cuda [0x16c5a]
========= Host Frame:./volume-cuda [0x8e17]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./volume-cuda [0x91fa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x311633]
========= Host Frame:./volume-cuda [0x5a2d0]
========= Host Frame:./volume-cuda [0x950d]
========= Host Frame:./volume-cuda [0x8e2c]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:./volume-cuda [0x91fa]
=========
terminate called after throwing an instance of 'thrust::system::system_error'
what(): device free failed: unspecified launch failure
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found
The first line (========= Invalid global read of size 8) repeats 9 times in the error message from cuda-memcheck. It keeps saying that it’s out of bound… It clearly shouldn’t as I shall try to explain further down…
As things get desperate, I have been trying to comment things to pinpoint the bug. Weirdly, what I found was something that I don’t even know how to google. In an attempt of trying to illustrate the problem, a simplified version would look something along the line of:
#include "thrust/device_vector.h"
#include "thrust/device_ptr.h"
#include "thrust/transform.h"
#include "thrust/for_each.h"
struct Point {
double x;
double y;
};
__host__ __device__
bool CheckPoint(thrust::device_ptr<Point> const setP, int nP,
Point const& testP) {
for (int i = 0; i < nP-1; i++)
{
Point const& p0 = setP[i];
Point const& p1 = setP[i+1];
double x0 = p0.x;
double y0 = p0.y;
double x1 = p1.x;
double y1 = p1.y;
// further calculation
}
return true;
};
struct SingleFunctor : public thrust::unary_function<int, int> {
__host__ __device__
int operator()(int i) { return 2*i;}
};
class DualFunctor : public thrust::binary_function<thrust::device_ptr<Point>, int, int> {
public:
__host__ __device__
int operator()(thrust::device_ptr<Point> setP, int np) {
Point testPoint;
testPoint.x = 0;
testPoint.y = 0;
// Do some dummy stuff for an excuse to call another functor
int p[1000], q[1000];
thrust::device_ptr<int> p_ptr(p), q_ptr(q);
thrust::transform(thrust::device, p_ptr, p_ptr+1000, q_ptr, SingleFunctor());
// Then perform the second check
CheckPoint(&(setP[0]), np, testPoint);
return 5;
}
};
int main(int argc, char *argv[]) {
int ncells = 100;
DualFunctor myfunctor1;
thrust::device_vector<int> nPoints(ncells);
thrust::host_vector<thrust::host_vector<Point>> setPoints(ncells);
thrust::device_vector<thrust::device_ptr<Point>> setPoints_ptr(ncells);
for (int i = 0; i < ncells; i++)
{
nPoints[i] = i+1;
setPoints[i].resize(i+1);
for (int j = 0; j < i+1; j++)
{
setPoints[i][j].x = j;
setPoints[i][j].y = j;
}
setPoints_ptr[i] = thrust::device_pointer_cast(setPoints[i].data());
}
thrust::device_vector<int> outvec(ncells);
thrust::transform(setPoints_ptr.begin(), setPoints_ptr.end(), nPoints.begin(),
outvec.begin(),
myfunctor1);
return 1;
}
This sample piece of code works without any problems - compiled and can be executed. However, using this as a skeleton for the bigger code I’m looking at, the problem lies inside the for loop of bool CheckPoint. If I comment out one of the line of declaration of Point const& (either p0 or p1), the big code will work. Leaving both in and I face the segfault.
Any recommendation? Have I crossed one too many lines this time?