Is this error looking like Thrust/CUDA is hitting some memory limit?

I have somehow managed to make my code compiled using CUDA as Thrust’s backend. However, I kept running into seg-fault by running the application. The error looks roughly like this:

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  device free failed: an illegal memory access was encountered

Further performing cuda-memcheck for the application looks something horribly wrong:

========= Invalid __global__ read of size 8
=========     at 0x000007e8 in void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::binary_transform_f<thrust::detail::normal_iterator<thrust::device_ptr<thrust::device_ptr<Geometry::Point<long=2>>>>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<Centroid<int=2>>>, thrust::cuda_cub::__transform::no_stencil_tag, VolfracEvaluator<int=2>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::binary_transform_f<thrust::detail::normal_iterator<thrust::device_ptr<thrust::device_ptr<Geometry::Point<long=2>>>>, thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::detail::normal_iterator<thrust::device_ptr<Centroid<int=2>>>, thrust::cuda_cub::__transform::no_stencil_tag, VolumeEvaluator<int=2>, thrust::cuda_cub::__transform::always_true_predicate>, long>(Geometry::Point<long=2>, thrust::device_ptr<Geometry::Point<long=2>>)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x5572d3abf8d0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x22b40d]
=========     Host Frame:./volume-cuda [0x2f36b]
=========     Host Frame:./volume-cuda [0x4c5be]
=========     Host Frame:./volume-cuda [0x16c5a]
=========     Host Frame:./volume-cuda [0x8e17]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./volume-cuda [0x91fa]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaFree. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x311633]
=========     Host Frame:./volume-cuda [0x5a2d0]
=========     Host Frame:./volume-cuda [0x950d]
=========     Host Frame:./volume-cuda [0x8e2c]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
=========     Host Frame:./volume-cuda [0x91fa]
=========
terminate called after throwing an instance of 'thrust::system::system_error'
  what():  device free failed: unspecified launch failure
========= Error: process didn't terminate successfully
========= No CUDA-MEMCHECK results found

The first line (========= Invalid global read of size 8) repeats 9 times in the error message from cuda-memcheck. It keeps saying that it’s out of bound… It clearly shouldn’t as I shall try to explain further down…

As things get desperate, I have been trying to comment things to pinpoint the bug. Weirdly, what I found was something that I don’t even know how to google. In an attempt of trying to illustrate the problem, a simplified version would look something along the line of:

#include "thrust/device_vector.h"
#include "thrust/device_ptr.h"
#include "thrust/transform.h"
#include "thrust/for_each.h"

struct Point {
  double x;
  double y;
};

__host__ __device__
bool CheckPoint(thrust::device_ptr<Point> const setP, int nP,
                Point const& testP) {
  for (int i = 0; i < nP-1; i++)
    {
      Point const& p0 = setP[i];
      Point const& p1 = setP[i+1];
      double x0 = p0.x;
      double y0 = p0.y;
      double x1 = p1.x;
      double y1 = p1.y;
      // further calculation                                                                                                                                                                                       
    }
  return true;
};

struct SingleFunctor  : public thrust::unary_function<int, int> {
  __host__ __device__
    int operator()(int i) { return 2*i;}
};

class DualFunctor : public thrust::binary_function<thrust::device_ptr<Point>, int, int> {
 public:
  __host__ __device__
  int operator()(thrust::device_ptr<Point> setP, int np) {
    Point testPoint;
    testPoint.x = 0;
    testPoint.y = 0;

    // Do some dummy stuff for an excuse to call another functor                                                                                                                                                   
    int p[1000], q[1000];
    thrust::device_ptr<int> p_ptr(p), q_ptr(q);
    thrust::transform(thrust::device, p_ptr, p_ptr+1000, q_ptr, SingleFunctor());

    // Then perform the second check                                                                                                                                                                               
    CheckPoint(&(setP[0]), np, testPoint);

    return 5;
  }
};

int main(int argc, char *argv[]) {
  int ncells = 100;

  DualFunctor myfunctor1;

  thrust::device_vector<int> nPoints(ncells);
  thrust::host_vector<thrust::host_vector<Point>> setPoints(ncells);
  thrust::device_vector<thrust::device_ptr<Point>> setPoints_ptr(ncells);

  for (int i = 0; i < ncells; i++)
    {
      nPoints[i] = i+1;
      setPoints[i].resize(i+1);
      for (int j = 0; j < i+1; j++)
        {
          setPoints[i][j].x = j;
          setPoints[i][j].y = j;
        }
      setPoints_ptr[i] = thrust::device_pointer_cast(setPoints[i].data());
    }

  thrust::device_vector<int> outvec(ncells);

  thrust::transform(setPoints_ptr.begin(), setPoints_ptr.end(), nPoints.begin(),
                    outvec.begin(),
                    myfunctor1);

  return 1;
}

This sample piece of code works without any problems - compiled and can be executed. However, using this as a skeleton for the bigger code I’m looking at, the problem lies inside the for loop of bool CheckPoint. If I comment out one of the line of declaration of Point const& (either p0 or p1), the big code will work. Leaving both in and I face the segfault.

Any recommendation? Have I crossed one too many lines this time?