cudaErrorIllegalAddress(700) and I can't figure out why

I’m creating a basic particle simulation in CUDA/C++ as part of a university assignment. The other part of the assignment was to make the same particle system in Rust making use of threading. Now, I need to do the same thing but making use of the GPU in CUDA.

I’m having some problems with it that just aren’t really making sense to me, I’m doing things exactly how we’ve done them before for lab assignments and yet I’m facing issues that never came up before. I keep getting an error that I’m attempting to write to an invalid memory address and I’m fairly certain it all comes down to my implementation of the function “update_particles_GPU”.

Originally, I was using device managed memory (initialising the arrays on the gpu and not having to worry about copying the data back and forth between host and device) but started facing this illegalMemoryAddress problem, this was also being done in a while loop that was locked to 60fps and had multiple aerosol cans, thousands of particles, etc so I tried to go simple with it and try doing just one can, with a small amount of particles, no simulation loop and manually copying the data to see if the problem became any clearer but… it didn’t. I don’t really know how to approach this problem anymore.

I also tried adding prints to the kernel function to see which thread it was throwing an error on but this did little to help me.

Another strange thing to note is how different thread configurations seem to work and others don’t. For example, 1 block of 3333 threads works when the particles are only updated once, but when this is done in a simulation loop, where particles are created and updated every frame, it eventually tries to access an invalid memory location.

Same with 1, 2000.

2, 1000 has an immediate error.

And using num_blocks, block_size also has an immediate error.

Breadkown of the code:

  • Create AerosolCan “red_can”

  • Create a null pointer “dev_red_particles”

  • Spray red can (creates a batch of particles and adds them to the red_can’s particles array) ← possible point of error

  • Allocate memory on GPU

  • Copy host “red_can.particles” (array) to device

  • Kernel function “update_particles_GPU” ← possible point of error

  • cudaDeviceSynchronize

  • Copy updated device array back to host

  • Print updated particle positions

The code:

   #define _USE_MATH_DEFINES
    #include <iostream>
    #include <math.h>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <chrono>
    #include <thread>
    #include <stdio.h>
    #include <vector>
    #include <windows.h>
    
    const float DRAG = 0.05;
    const float GRAVITY = 9.8;
    const float BLEND = 0.1;
    
    struct Colour {
        float red = 1;
        float green = 1;
        float blue = 1;
    };
    
    struct Particle {
        Colour colour = Colour();
        float x = 0;
        float y = 0;
        float z = 0;
        float velocity_x = 0;
        float velocity_y = 0;
        float velocity_z = 0;
        bool collided = false;
        bool landed_on_paper = false;
    };
    
    const int max_particles_per_can = 3333;
    
    __global__ void update_particles_GPU(Particle* particles, const float* time_step, const uint32_t* particles_created) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        //printf("index : (%d)", i);
    
        if (i > *particles_created) {
            //printf("\n");
            return;
        }
    
        Particle* particle;
        if (&particles[i] == NULL) {
            //printf("\n");
            return;
        }
        else {
            particle = &particles[i];
        }
    
        if (!particle->collided) {
            float GRAVITY = 9.8;
            float DRAG = 0.05;
            float acceleration_x = -DRAG * (particle->velocity_x * particle->velocity_x);
            float distance_x = particle->velocity_x * *time_step + 0.5 * acceleration_x * (*time_step * *time_step);
    
            float acceleration_y = GRAVITY - DRAG * (particle->velocity_y * particle->velocity_y);
            float distance_y = particle->velocity_y * *time_step + 0.5 * acceleration_y * (*time_step * *time_step);
    
            float acceleration_z = -DRAG * (particle->velocity_z * particle->velocity_z);
            float distance_z = particle->velocity_z * *time_step + 0.5 * acceleration_z * (*time_step * *time_step);
    
            particle->x += distance_x;
            particle->y += distance_y;
            particle->z += distance_z;
    
            if (particle->y < 0) {
                particle->y = 0;
            }
    
            if (particle->velocity_x < 0) {
                particle->velocity_x += -acceleration_x * *time_step;
            }
            else {
                particle->velocity_x += acceleration_x * *time_step;
            }
    
            particle->velocity_y += -acceleration_y * *time_step;
    
            if (particle->velocity_z < 0) {
                particle->velocity_z += -acceleration_z * *time_step;
            }
            else {
                particle->velocity_z += acceleration_z * *time_step;
            }
    
            // Collision
            if (particle->y == 0) {
                particle->collided = true;
            }
        }
        //printf("\n");
    }
    
    struct AerosolCan {
        //std::vector<Particle> particles = std::vector<Particle>();
        Particle* particles = new Particle[max_particles_per_can];
    
        float x = 0;
        float y = 0;
        float z = 0;
        float base_velocity_x = 0;
        float base_velocity_y = 0;
        float base_velocity_z = 0;
        Colour colour = Colour();
        float spray_radius = 0;
        uint32_t particles_created = 0;
    
    void print_particles() {
            int index = 1;
            
            for (int i = 0; i < particles_created; i++) {
                Particle* particle = &particles[i];
                std::cout << "Particle " << i + 1 << " | X: " << particle->x << " | Y: "
                    << particle->y << " | Z: " << particle->z << " | Hit = " << particle->landed_on_paper << std::endl;
            }
            
            std::cout << "" << std::endl;
        }
    
        void spray(Particle* particles, uint32_t number_of_particles) {
            float radius = spray_radius;
            while (radius > 0.0 && number_of_particles > 0) {
                // Create new particles
                for (int i = 0; i < number_of_particles; i++) {
                    float horizontal_angle = i / number_of_particles * 2.0 * M_PI;
                    float vertical_angle = i / number_of_particles * M_PI;
    
                    float new_x = x + radius * cos(horizontal_angle) * sin(vertical_angle);
                    float new_y = y + radius * sin(horizontal_angle) * sin(vertical_angle);
                    float new_z = z + radius * cos(vertical_angle);
    
                    Particle new_particle = Particle();
                    new_particle.colour = colour;
                    new_particle.x = new_x;
                    new_particle.y = new_y;
                    new_particle.z = new_z;
                    new_particle.velocity_x = base_velocity_x;
                    new_particle.velocity_y = base_velocity_y;
                    new_particle.velocity_z = base_velocity_z;
    
                    //reinterpret_cast<Particle*>(&particles)[particles_created] = new_particle;
                    particles[particles_created] = new_particle;
    
                    particles_created++;
                }
    
                radius = radius / 2.0;
                number_of_particles = number_of_particles / 2.0;
            }
        }
    };
    
    const int block_size = 256;
    
    int main() {
        int num_blocks = (max_particles_per_can + block_size - 1) / block_size;
        std::cout << "Blocks: " << num_blocks << " | Block size: " << block_size << std::endl;
    
        Colour red_colour;
        red_colour.red = 1;
        red_colour.green = 0;
        red_colour.blue = 0;
    
        AerosolCan red_can = AerosolCan();
        red_can.colour = red_colour;
        red_can.x = -25;
        red_can.y = 30;
        red_can.z = 60;
        red_can.base_velocity_x = 125;
        red_can.base_velocity_y = 10;
        red_can.base_velocity_z = 0;
        red_can.spray_radius = 15;
    
    Particle* dev_red_particles = nullptr;
    
        // Spray particles
        red_can.spray(red_can.particles, 5);
    
        // Copy data to GPU
        cudaError_t cuda_status;
        cuda_status = cudaMalloc((void**)&dev_red_particles, max_particles_per_can * sizeof(Particle));
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed");
            goto Error;
        }
    
        cuda_status = cudaMemcpy(dev_red_particles, red_can.particles, max_particles_per_can * sizeof(Particle), cudaMemcpyHostToDevice);
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed");
            goto Error;
        }
    
        float time_step = 0.001;
    
        // Update particles
        update_particles_GPU << <num_blocks, block_size >> > (dev_red_particles, &time_step, &red_can.particles_created);
    
        cuda_status = cudaDeviceSynchronize();
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "cudaSync failed");
            goto Error;
        }
    
        // Copy data back to host
        cuda_status = cudaMemcpy(red_can.particles, dev_red_particles, max_particles_per_can * sizeof(Particle), cudaMemcpyDeviceToHost);
        if (cuda_status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed");
            goto Error;
        }
    
        // Print particles
        red_can.print_particles();
    
        Error:
            cudaFree(dev_red_particles);
    
        return 0;
    }