For_each: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered

Cuda: 11.2 GPU: NVIDIA T4 OS: Ubuntu 16.04 C++14
When I tried to run my project, this error occurred.
In real code, I did very simply funtion but it failed:

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  for_each: failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered
Aborted (core dumped)

My project only contains four major files (main.cu, Grid.cu, Grid.h, Particle.h):
Compile command:

nvcc -o out -g -std=c++14 -O2 --expt-extended-lambda --expt-relaxed-constexpr -DTHRUST_DEBUG -arch=sm_70 -Wno-deprecated-declarations -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored -rdc=true main.cu Grid.cu Matrix2D.cu Particle.cu Vector2D.cu

main.cu:

#include <iostream>

#include <vector>

#include "Constants.h"

#include "Particle.h"

#include "Grid.h"

using namespace std;

Grid *grid;

__host__ void init() {

    vector<Particle> p;

    for (int i = 0; i < 10000; ++i)

        p.push_back(Particle());

    grid = new Grid(Vector2D(0, 0), Vector2D(WIN_METERS_X, WIN_METERS_Y), Vector2D(256, 128), &p);

    grid->initGridMassVel();

}

__host__ void update() {

    

}

int main()

{

    init();

    return 0;

}

Grid.h:

#ifndef GRID_H

#define GRID_H

#include "Constants.h"

#include "Vector2D.h"

#include "Particle.h"

#include <cmath>

#include <cstring>

#include <vector>

#include <thrust/functional.h>

#include <thrust/execution_policy.h>

#include <thrust/for_each.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <thrust/copy.h>

#include <thrust/device_vector.h>

using namespace std;

struct Node {

    double mass;

    int active;

    Vector2D pos, vel, vel_new, force;

    __host__ __device__ Node() {

        mass = 0;

        active = 0;

        pos = vel = vel_new = force = Vector2D();

    }

};

class Grid

{

public:

    // Grid origin and size

    Vector2D origin, size;

    

    // Grid nodes

    Vector2D node_size;

    double node_area;

    int nodes_length;

    __host__ Grid(Vector2D pos, Vector2D dims, Vector2D window_size, vector<Particle>* _particles) :

        origin(pos), size(window_size) {

        node_size = dims / window_size;

        size.x += 1, size.y += 1;

        nodes_length = int(size.x * size.y);

        node_area = node_size.x * node_size.y;

        particles.resize(_particles->size());

        thrust::copy(_particles->begin(), _particles->end(), particles.begin());

        

        nodes.resize(nodes_length);

        vector<Node> temp_nodes;

        temp_nodes.resize(nodes_length);

        for (int y = 0; y < size.y; ++y) {

            for (int x = 0; x < size.x; ++x) {

                int node_id = int(y * size.x + x);

                temp_nodes[node_id].pos = Vector2D(x, y);

            }

        }

        thrust::copy(temp_nodes.begin(), temp_nodes.end(), nodes.begin());

    }

    // Map particles to Grid

    __host__ void initGridMassVel();

private:

    thrust::device_vector<Particle> particles;

    thrust::device_vector<Node> nodes;

};

#endif

Grid.cu:

#include "Grid.h"

#include <cassert>

#include <cstdio>

#define MATRIX_EPSILON 1e-6

__host__ __device__ double bspline(double x) {

    x = fabs(x);

    double w;

    if (x < 1)

        w = x * x * (x / 2 - 1) + 2 / 3.0;

    else if (x < 2)

        w = x * (x * (-x / 6 + 1) - 2) + 4 / 3.0;

    else return 0;

    return w;

}

//Slope of interpolation function

__host__ __device__ double bsplineSlope(double x) {

    double abs_x = fabs(x);

    if (abs_x < 1)

        return 1.5 * x * abs_x - 2 * x;

    else if (x < 2)

        return -x * abs_x / 2 + 2 * x - 2 * x / abs_x;

    else return 0;

}

__device__ double my_atomicAdd(double* address, double val)

{

    unsigned long long int* address_as_ull =

                              (unsigned long long int*)address;

    unsigned long long int old = *address_as_ull, assumed;

    do {

        assumed = old;

        old = atomicCAS(address_as_ull, assumed,

                        __double_as_longlong(val +

                               __longlong_as_double(assumed)));

    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)

    } while (assumed != old);

    return __longlong_as_double(old);

}

__host__ void Grid::initGridMassVel() {

    // Map particle to grid

    Node* grid_ptr = thrust::raw_pointer_cast(&nodes[0]);

    auto func = [=] __device__ (Particle & p) {

        // get the index of the grid cross point corresponding to the particle (it is on the bottom left of the particle)

        p.grid_p = (p.pos - origin) / node_size;

        int p_x = (int)p.grid_p.x;  // x coord index in grid

        int p_y = (int)p.grid_p.y;  // y coord index in grid

        //printf("P: %d %d\n", p_x, p_y);

        // Map from (p_x - 1, p_y - 1) to (p_x + 2, p_y + 2)

        // The origin is bottom left, which means node_id = y * size.x + x

        for (int it = 0, y = p_y - 1; y <= p_y + 2; ++y) {

            if (y < 0 || y >= size.y) // here size.y has already been added by 1

                continue;

            // Y interpolation

            double weight_y = bspline(p.grid_p.y - y);

            double dy = bsplineSlope(p.grid_p.y - y);

            for (int x = p_x - 1; x <= p_x + 2; ++x, ++it) {

                if (x < 0 || x >= size.x)

                    continue;

                // X interpolation

                double weight_x = bspline(p.grid_p.x - x);

                double dx = bsplineSlope(p.grid_p.x - x);

                // set weight of particles related nodes

                double w = weight_x * weight_y;

                p.weights[it] = w;

                // set weight gradient

                p.weight_gradient[it] = Vector2D(dx * weight_y, dy * weight_x);

                p.weight_gradient[it] /= node_size;

                // set node weighted mass and velocity

                int node_id = int(y * size.x + x);

                //nodes[node_id].mass += w * p.mass;

                my_atomicAdd(&(grid_ptr[node_id].mass), w * p.mass);

                //nodes[node_id].vel += p.vel * w * p.mass;

                Vector2D temp = p.vel * w * p.mass;

                my_atomicAdd(&(grid_ptr[node_id].vel.x), temp.x);

                my_atomicAdd(&(grid_ptr[node_id].vel.y), temp.y);

                //nodes[node_id].active = true;

                atomicAdd(&(grid_ptr[node_id].active), 1);

            }

        }

    };

    thrust::for_each(thrust::device, particles.begin(), particles.end(), func);

    thrust::for_each(

        thrust::device,

        nodes.begin(),

        nodes.end(),

        [=] __device__(Node& n) {

            if (n.active)

                n.vel /= n.mass;

        }

    );

}

Particle.h:

#include <cstring>

#include "Vector2D.h"

#include "Matrix2D.h"

#include <cuda.h>

#include <cuda_runtime.h>

using namespace std;

class Particle

{

public:

    double mass, density, volume;

    Vector2D pos, vel;

    Matrix2D velocity_gradient;

    Matrix2D deformation_gradient, plastic_deformation, elastic_deformation;

    Matrix2D stress;

    

    // assigned grid index

    Vector2D grid_p;

    // weight value for nearest 16 grid nodes

    double weights[16];

    Vector2D weight_gradient[16];

    // TODO: need more variables here like deformation;

    __host__ __device__ Particle() {};

    

    __host__ __device__ Particle(const Vector2D& pos, const Vector2D& vel, double mass) :

        pos(pos), vel(vel), mass(mass) {

        // TODO: more varibles means re-write init function

        this->deformation_gradient = Matrix2D();  // init as identity matrix

        this->plastic_deformation = Matrix2D();  // init as identity matrix

        this->elastic_deformation = Matrix2D();  // init as identity matrix

        this->volume = 1;  // init volume is 1 (a unit volume)

        memset(weights, 0, sizeof(double) * 16);

        memset(weight_gradient, 0, sizeof(Vector2D) * 16);

    }

};

some suggestions:

  • Don’t post pictures of code. Post the code as formatted text. Use the </> button after selecting your code, is one possible approach.
  • Fix the formatting of your text code. See above.
  • Provide a complete example. For example, I don’t see the Particle struct definition.
  • To repeat, provide a complete example. Provide the missing struct definitions, a main routine, your includes, and something that calls your code. Based on what I see here, it should not require more than about a dozen or so additional lines of code beyond what you have posted.
  • Then test the code you posted yourself. Start a new empty project. Put only your posted code in it. Compile it and run it. If you can’t compile it, add whatever is missing to your post. If when you run it, it doesn’t show the error, then keep working on it until it does show the error.

Thank you so much for replying! I edited the post. Could you please see it again? The calling is really simple. Actually, when initializing the Simulator. grid = new Grid() worked, which means that thrust::copy worked for my code. But when it steped into grid->initGridMass(), it failed cause of illegal access of memory address.

I tried an empty new folder and pasted minimal codes I needed. I edited the original post again. Besides those four files, I also have Matrix2D.h, Matrix2D.cu, Vector2D.h. It seems I cannot paste more codes in the post so I added here:

Matrix2D.h:

#ifndef MATRIX_2D_H

#define MATRIX_2D_H

#include "Vector2D.h"

#include <cmath>

#include <cuda.h>

#include <cuda_runtime.h>

class Matrix2D

{

public:

    // Default Matrix = Identity Matrix

    __host__ __device__ Matrix2D(double val = 1.) {

        for (int i = 0; i < 2; ++i)

            for (int j = 0; j < 2; ++j)

                (*this)(i, j) = (i == j) ? val : 0.;

    }

    __host__ __device__ Matrix2D(double* data) {

        for (int i = 0; i < 2; ++i)

            for (int j = 0; j < 2; ++j)

                (*this)(i, j) = data[i * 2 + j];

    }

    __host__ __device__ Matrix2D(double m00, double m01,

        double m10, double m11) {

        (*this)(0, 0) = m00; (*this)(0, 1) = m01;

        (*this)(1, 0) = m10; (*this)(1, 1) = m11;

    }

    // set all values to val (detaul: 0.)

    __host__ __device__ void zero(double val = 0.);

    // determinant

    __host__ __device__ double det(void) const;

    // Return the column vector

    __host__ __device__ Vector2D& column(int i);

    __host__ __device__ const Vector2D& column(int i) const;

    // Transpose

    __host__ __device__ Matrix2D T(void) const;

    // Inverse

    __host__ __device__ Matrix2D inv(void) const;

    // accesses element (i,j) of A using 0-based indexing

    __host__ __device__ double& operator()(int i, int j);

    __host__ __device__ const double& operator()(int i, int j) const;

    // accesses the ith column of A

    __host__ __device__ Vector2D& operator[](int i);

    __host__ __device__ const Vector2D& operator[](int i) const;

    // increments by B

    __host__ __device__ void operator+=(const Matrix2D& B);

    // A+c

    __host__ __device__ Matrix2D operator+(const Matrix2D& B) const;

    // returns -A

    __host__ __device__ Matrix2D operator-(void) const;

    // returns A-B

    __host__ __device__ Matrix2D operator-(const Matrix2D& B) const;

    // returns c*A

    __host__ __device__ Matrix2D operator*(double c) const;

    // returns A*B

    __host__ __device__ Matrix2D operator*(const Matrix2D& B) const;

    // returns A*x

    __host__ __device__ Vector2D operator*(const Vector2D& x) const;

    // divides each element by x

    __host__ __device__ void operator/=(double x);

protected:

    Vector2D entries[2];

};

#endif

Matrix.cu:

#include "Matrix2D.h"

__host__ __device__ double& Matrix2D::operator() (int i, int j) {

    return entries[j][i];

}

__host__ __device__ const double& Matrix2D::operator() (int i, int j) const {

    return entries[j][i];

}

__host__ __device__ Vector2D& Matrix2D::operator[] (int j) {

    return entries[j];

}

__host__ __device__ const Vector2D& Matrix2D::operator[] (int j) const {

    return entries[j];

}

__host__ __device__ void Matrix2D::zero(double val) {

    entries[0] = entries[1] = Vector2D(val, val);

}

__host__ __device__ double Matrix2D::det(void) const {

    const Matrix2D& A(*this);

    return A(0, 0) * A(1, 1) - A(0, 1) * A(1, 0);

}

__host__ __device__ Matrix2D Matrix2D::T(void) const {

    const Matrix2D& A(*this);

    Matrix2D B;

    for (int i = 0; i < 2; ++i) {

        for (int j = 0; j < 2; ++j)

            B(i, j) = A(j, i);

    }

    return B;

}

__host__ __device__ Matrix2D Matrix2D::inv(void) const {

    const Matrix2D& A(*this);

    double d = det();

    return Matrix2D(A(1, 1) / d, -A(1, 0) / d, -A(0, 1) / d, A(0, 0) / d);

}

__host__ __device__ Vector2D& Matrix2D::column(int i) {

    return entries[i];

}

__host__ __device__ Matrix2D Matrix2D::operator-(void) const {

    const Matrix2D& A(*this);

    Matrix2D B;

    B[0] = -A[0];

    B[1] = -A[1];

    

    return B;

}

__host__ __device__ Matrix2D Matrix2D::operator-(const Matrix2D& B) const {

    const Matrix2D& A(*this);

    Matrix2D C;

    C[0] = A[0] - B[0];

    C[1] = A[1] - B[1];

    return C;

}

__host__ __device__ void Matrix2D::operator+=(const Matrix2D& B) {

    Matrix2D& A(*this);

    A[0] += B[0];

    A[1] += B[1];

}

__host__ __device__ Matrix2D Matrix2D::operator*(double c) const {

    const Matrix2D& A(*this);

    Matrix2D B;

    B[0] = A[0] * c;

    B[1] = A[1] * c;

    return B;

}

__host__ __device__ Matrix2D Matrix2D::operator*(const Matrix2D& B) const {

    const Matrix2D& A(*this);

    Matrix2D C;

    C[0] = A * B[0];

    C[1] = A * B[1];

    return C;

}

__host__ __device__ Vector2D Matrix2D::operator*(const Vector2D& x) const {

    return x.x * entries[0] + x.y * entries[1];

}

__host__ __device__ void Matrix2D::operator/=(double x) {

    Matrix2D& A(*this);

    double rx = 1. / x;

    A[0] *= rx;

    A[1] *= rx;

}

__host__ __device__ Matrix2D Matrix2D::operator+(const Matrix2D& B) const {

    const Matrix2D& A(*this);

    Matrix2D C;

    C[0] = A[0] + B[0];

    C[1] = A[1] + B[1];

    return C;

}

Vector2D.h:

#ifndef VECTOR2D_H

#define VECTOR2D_H

#include <cmath>

#include <cuda.h>

#include <cuda_runtime.h>

class Vector2D

{

public:

    double x, y;

    // Constructor

    __host__ __device__ Vector2D() : x(0.), y(0.) {}

    __host__ __device__ Vector2D(double x, double y) : x(x), y(y) {}

    __host__ __device__ Vector2D(const Vector2D& v) : x(v.x), y(v.y) {}

    //Operators

    // returns reference to the specified component (0-based indexing: x, y)

    __host__ __device__ inline double& operator[] (const int& index) {

        return (&x)[index];

    }

    // returns const reference to the specified component (0-based indexing: x, y)

    __host__ __device__ inline const double& operator[] (const int& index) const {

        return (&x)[index];

    }

    // additive inverse

    __host__ __device__ inline Vector2D operator-(void) const {

        return Vector2D(-x, -y);

    }

    // addition

    __host__ __device__  inline Vector2D operator+(const Vector2D& v) const {

        Vector2D u = *this;

        u += v;

        return u;

    }

    // subtraction

    __host__ __device__  inline Vector2D operator-(const Vector2D& v) const {

        Vector2D u = *this;

        u -= v;

        return u;

    }

    // right scalar multiplication

    __host__ __device__  inline Vector2D operator*(double r) const {

        Vector2D vr = *this;

        vr *= r;

        return vr;

    }

    __host__ __device__  inline Vector2D operator*(const Vector2D& v) const {

        Vector2D vr = *this;

        vr.x *= v.x;

        vr.y *= v.y;

        return vr;

    }

    // scalar division

    __host__ __device__  inline Vector2D operator/(double r) const {

        Vector2D vr = *this;

        vr /= r;

        return vr;

    }

    __host__ __device__  inline Vector2D operator/(Vector2D v) const {

        Vector2D vr = *this;

        vr.x = vr.x / v.x;

        vr.y = vr.y / v.y;

        return vr;

    }

    // add v

    __host__ __device__  inline void operator+=(const Vector2D& v) {

        x += v.x;

        y += v.y;

    }

    // subtract v

    __host__ __device__  inline void operator-=(const Vector2D& v) {

        x -= v.x;

        y -= v.y;

    }

    // scalar multiply by r

    __host__ __device__  inline void operator*=(double r) {

        x *= r;

        y *= r;

    }

    // scalar divide by r

    __host__ __device__  inline void operator/=(double r) {

        x /= r;

        y /= r;

    }

    __host__ __device__  inline void operator/=(Vector2D v) {

        *this = *this / v;

    }

    /**

     * Returns norm.

     */

    __host__ __device__  inline double norm(void) const {

        return sqrt(x * x + y * y);

    }

    /**

     * Returns norm squared.

     */

    __host__ __device__  inline double norm2(void) const {

        return x * x + y * y;

    }

    /**

     * Returns unit vector parallel to this one.

     */

    __host__ __device__  inline Vector2D unit(void) const {

        return *this / this->norm();

    }

}; // clasd Vector2D

// left scalar multiplication

__host__ __device__ inline Vector2D operator*(double r, const Vector2D& v) {

    return v * r;

}

// inner product

__host__ __device__ inline double dot(const Vector2D& v1, const Vector2D& v2) {

    return v1.x * v2.x + v1.y * v2.y;

}

// cross product

__host__ __device__ inline double cross(const Vector2D& v1, const Vector2D& v2) {

    return v1.x * v2.y - v1.y * v2.x;

}

#endif

Sorry for your time. Really need help with this project.

Sorry, I’m not able to compile the code you have shown. Perhaps someone else will be able to help you.