CUDA error: an illegal memory access was encountered (Address 0x0 is out of bounds)

h1ugroon · April 3, 2025, 4:01pm

I’m trying to build a forward function for my NEAT algorithm but compiler is giving an illegal memory access was encountered error. I tried to mem-check but there is nothing helpful

Output;

========= Invalid __global__ write of size 4 bytes
=========     at EvaluateNetworks(float *, Network *, int, int, int)+0x7c0
=========     by thread (96,0,0) in block (0,0,0)
=========     Address 0x0 is out of bounds
=========     and is 47.248.900.096 bytes before the nearest allocation at 0xb00410000 of size 65.536 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========         Host Frame: cuEventRecordWithFlags [0x7ff9aa88fbc6] in nvcuda64.dll
=========         Host Frame:  [0x16468] in a.exe
=========         Host Frame:  [0x16326] in a.exe
=========         Host Frame:  [0x16f61] in a.exe
=========         Host Frame:  [0x650b] in a.exe
=========         Host Frame:  [0x5829] in a.exe
=========         Host Frame:  [0x4c29] in a.exe
=========         Host Frame:  [0x5000] in a.exe
=========         Host Frame:  [0x2aa9c] in a.exe
=========         Host Frame: BaseThreadInitThunk [0x7ff9e7d07374] in KERNEL32.DLL
=========         Host Frame: RtlUserThreadStart [0x7ff9e8c3cc91] in ntdll.dll
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========         Host Frame: cuProfilerStop [0x7ff9aa9cc345] in nvcuda64.dll
=========         Host Frame:  [0x16a9c] in a.exe
=========         Host Frame:  [0x5006] in a.exe
=========         Host Frame:  [0x2aa9c] in a.exe
=========         Host Frame: BaseThreadInitThunk [0x7ff9e7d07374] in KERNEL32.DLL
=========         Host Frame: RtlUserThreadStart [0x7ff9e8c3cc91] in ntdll.dll
=========
GPUassert: unspecified launch failure a.cu 305

Structs:

struct Connection {
    int innovationid;
    int from;
    int to;
    float weight;
    //int type; // 0 input to hidden, 1 input to output, 2 hidden to hidden, 3 hidden to output
};

struct Neuron {
    int type;  // 0 input, 1 hidden, 2 output
    float input_sum;
    float bias;
    float output = 0.0f;
    int* incoming_connections;
    int id;
    int connected_num;
};

struct Network {
    Connection* Connections; 
    Neuron* Neurons;
    int output_ids[3];
    int num_neurons;
    int num_connections;
    float fitness;
};

Functions:

__device__ void GetNetworkOutput(float* rollingdata, Network* net, int* output_ids, int* result_id) { 
    float* outputs = new float[net->num_neurons];
    float* values = new float[net->num_neurons];
    int* completed = new int[net->num_connections];

    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 0) {
            values[i] = rollingdata[i];
            outputs[i] = rollingdata[i];
        }

        else {
            values[i] = 0.0f;
            outputs[i] = 0.0f;
        }
    }

    bool finished = false;

    for (int i = 0; i < net->num_connections; ++i) {
        if (net->Neurons[net->Connections[i].from].type == 0) {
            values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
            completed[i] = 1;
        }

        else {
            completed[i] = 0;
        }
    }

    while (!finished) {
        finished = true;

        for (int i = 0; i < net->num_connections; ++i) {
            if (completed[i] == 0) {
                finished = false;

                if (values[net->Connections[i].from] != 0.0f) {
                    bool ehezowski = true;

                    for (int j = 0; j < net->Neurons[net->Connections[i].from].connected_num; ++j) {
                        if (outputs[net->Neurons[net->Connections[i].from].incoming_connections[j]] == 0.0f) {
                            ehezowski = false;
                        }
                    }

                    if (ehezowski) {
                        if (outputs[net->Connections[i].from] == 0.0f) {
                            outputs[net->Connections[i].from] = activationFunction(values[net->Connections[i].from] + net->Neurons[net->Connections[i].from].bias);
                        }

                        values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
                        completed[i] = 1;
                    }
                }                
            }
        }
    }

    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 2) {
            outputs[i] = activationFunction(values[i] + net->Neurons[i].bias);
        }
    }

    float biggest = 0.0f;

    for (int index = 0; index < 3; ++index) {
        int val = output_ids[index];

        biggest = (outputs[val] > biggest) ? outputs[val] : biggest;
        *result_id = (outputs[val] > biggest) ? index : *result_id;
    } 

    delete[] completed;
    delete[] values;
    delete[] outputs;
}

__global__ void EvaluateNetworks(float* __restrict__ rollingdata, Network* __restrict__ d_networks, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    float* first_390_data = new float[input_num];
        
    for (int i = 0; i < input_num; ++i) {
        first_390_data[i] = rollingdata[i];
    } 

    Network* net = &d_networks[idx];
    int result_id = -1;

    if (net == nullptr || net->output_ids == nullptr || net->Connections == nullptr) {
        printf("Error: Null pointer in GetNetworkOutput\n");
        return;
    }

    GetNetworkOutput(first_390_data, net, net->output_ids, &result_id);

    printf("%i\n", result_id);
}

It’s probably something like wrong pointer call but after hours of trying to solve the issue, couldn’t find the issue with the code

striker159 · April 3, 2025, 4:31pm

When you compile your code with -lineinfo, compute-sanitizer should give you the file and line number where the error occurs.

However, even without line information the output is really helpful. It tells you exactly why the error occurs. You try to write to pointer, but the pointer is null.

Everytime you use dynamic allocation in device code, you need to check if the returned pointer is not nullptr. I do not see such checks in your code. Nullptr indicates allocation error, for example, out of memory.
You should also keep in mind that the device heap is small by default and a larger size needs to be set explicitly.

Personally, I would try to allocate the required device memory from the host instead of relying on device-side allocation. Judging from your code, all required sizes are known at kernel launch and are not computed dynamically on the device.

h1ugroon · April 3, 2025, 4:47pm

Hello, thank you for your reply!

I tried to debug the pointers to check if they are nullptr and yes, you are right, pointers are not being allocated. But wondering the reason

    float* outputs = new float[net->num_neurons];
    float* values = new float[net->num_neurons];
    int* completed = new int[net->num_connections];

    if (outputs == nullptr || values == nullptr || completed == nullptr) {
        printf("Memory allocation failed\n");
        return;
    }

Also tried to allocate memory via cudaMalloc

    float* outputs;
    float* values;
    int* completed;

    cudaMalloc((void**)&outputs, net->num_neurons * sizeof(float));
    cudaMalloc((void**)&values, net->num_neurons * sizeof(float));
    cudaMalloc((void**)&completed, net->num_connections * sizeof(int));

    if (outputs == nullptr || values == nullptr || completed == nullptr) {
        printf("Memory allocation failed\n");
        return;
    }

And output is

Memory allocation failed
Memory allocation failed
Memory allocation failed
Memory allocation failed
Memory allocation failed
...

How should I allocate memory and what’s actually wrong with that implementation?

h1ugroon · April 3, 2025, 5:01pm

Okay default memory was not enough. Used cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024 * 1024 * 1024); to set heap memory to 1 GB and now allocations are working perfectly

Topic		Replies	Views
CUDA_ERROR_ILLEGAL_ADDRESS CUDA Programming and Performance	6	10934	September 26, 2017
Cuda unknown error on the shaded line.can anyone help? CUDA Programming and Performance	4	1264	March 12, 2015
Tracking Invalid read size and illegal memory access CUDA Programming and Performance	3	7696	May 24, 2016
CUDA Address out of bounds error - help! CUDA Programming and Performance	0	838	August 8, 2019
How to check GPU kernel's error? CUDA Programming and Performance	7	4762	January 22, 2019
Address out of bounds error - help! General Topics and Other SDKs	0	1028	August 8, 2019
How to interpret cudaMemCheck output of access violation? CUDA Programming and Performance	7	854	September 7, 2017
illegal memory access - any help appreciated CUDA Programming and Performance	5	6709	February 8, 2018
Transparent inter-GPU memory migration CUDA Programming and Performance cuda	4	317	December 14, 2023
an illegal memory access was encountered CUDA Programming and Performance	7	64242	November 10, 2017

CUDA error: an illegal memory access was encountered (Address 0x0 is out of bounds)

Related topics