CUDA error: an illegal memory access was encountered (Address 0x0 is out of bounds)

I’m trying to build a forward function for my NEAT algorithm but compiler is giving an illegal memory access was encountered error. I tried to mem-check but there is nothing helpful

Output;

========= Invalid __global__ write of size 4 bytes
=========     at EvaluateNetworks(float *, Network *, int, int, int)+0x7c0
=========     by thread (96,0,0) in block (0,0,0)
=========     Address 0x0 is out of bounds
=========     and is 47.248.900.096 bytes before the nearest allocation at 0xb00410000 of size 65.536 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========         Host Frame: cuEventRecordWithFlags [0x7ff9aa88fbc6] in nvcuda64.dll
=========         Host Frame:  [0x16468] in a.exe
=========         Host Frame:  [0x16326] in a.exe
=========         Host Frame:  [0x16f61] in a.exe
=========         Host Frame:  [0x650b] in a.exe
=========         Host Frame:  [0x5829] in a.exe
=========         Host Frame:  [0x4c29] in a.exe
=========         Host Frame:  [0x5000] in a.exe
=========         Host Frame:  [0x2aa9c] in a.exe
=========         Host Frame: BaseThreadInitThunk [0x7ff9e7d07374] in KERNEL32.DLL
=========         Host Frame: RtlUserThreadStart [0x7ff9e8c3cc91] in ntdll.dll
=========
========= Program hit cudaErrorLaunchFailure (error 719) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========         Host Frame: cuProfilerStop [0x7ff9aa9cc345] in nvcuda64.dll
=========         Host Frame:  [0x16a9c] in a.exe
=========         Host Frame:  [0x5006] in a.exe
=========         Host Frame:  [0x2aa9c] in a.exe
=========         Host Frame: BaseThreadInitThunk [0x7ff9e7d07374] in KERNEL32.DLL
=========         Host Frame: RtlUserThreadStart [0x7ff9e8c3cc91] in ntdll.dll
=========
GPUassert: unspecified launch failure a.cu 305

Structs:

struct Connection {
    int innovationid;
    int from;
    int to;
    float weight;
    //int type; // 0 input to hidden, 1 input to output, 2 hidden to hidden, 3 hidden to output
};

struct Neuron {
    int type;  // 0 input, 1 hidden, 2 output
    float input_sum;
    float bias;
    float output = 0.0f;
    int* incoming_connections;
    int id;
    int connected_num;
};

struct Network {
    Connection* Connections; 
    Neuron* Neurons;
    int output_ids[3];
    int num_neurons;
    int num_connections;
    float fitness;
};

Functions:

__device__ void GetNetworkOutput(float* rollingdata, Network* net, int* output_ids, int* result_id) { 
    float* outputs = new float[net->num_neurons];
    float* values = new float[net->num_neurons];
    int* completed = new int[net->num_connections];

    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 0) {
            values[i] = rollingdata[i];
            outputs[i] = rollingdata[i];
        }

        else {
            values[i] = 0.0f;
            outputs[i] = 0.0f;
        }
    }

    bool finished = false;

    for (int i = 0; i < net->num_connections; ++i) {
        if (net->Neurons[net->Connections[i].from].type == 0) {
            values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
            completed[i] = 1;
        }

        else {
            completed[i] = 0;
        }
    }

    while (!finished) {
        finished = true;

        for (int i = 0; i < net->num_connections; ++i) {
            if (completed[i] == 0) {
                finished = false;

                if (values[net->Connections[i].from] != 0.0f) {
                    bool ehezowski = true;

                    for (int j = 0; j < net->Neurons[net->Connections[i].from].connected_num; ++j) {
                        if (outputs[net->Neurons[net->Connections[i].from].incoming_connections[j]] == 0.0f) {
                            ehezowski = false;
                        }
                    }

                    if (ehezowski) {
                        if (outputs[net->Connections[i].from] == 0.0f) {
                            outputs[net->Connections[i].from] = activationFunction(values[net->Connections[i].from] + net->Neurons[net->Connections[i].from].bias);
                        }

                        values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
                        completed[i] = 1;
                    }
                }                
            }
        }
    }

    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 2) {
            outputs[i] = activationFunction(values[i] + net->Neurons[i].bias);
        }
    }

    float biggest = 0.0f;

    for (int index = 0; index < 3; ++index) {
        int val = output_ids[index];

        biggest = (outputs[val] > biggest) ? outputs[val] : biggest;
        *result_id = (outputs[val] > biggest) ? index : *result_id;
    } 

    delete[] completed;
    delete[] values;
    delete[] outputs;
}

__global__ void EvaluateNetworks(float* __restrict__ rollingdata, Network* __restrict__ d_networks, int pop_num, int input_num, int output_num) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= pop_num) return;

    float* first_390_data = new float[input_num];
        
    for (int i = 0; i < input_num; ++i) {
        first_390_data[i] = rollingdata[i];
    } 

    Network* net = &d_networks[idx];
    int result_id = -1;

    if (net == nullptr || net->output_ids == nullptr || net->Connections == nullptr) {
        printf("Error: Null pointer in GetNetworkOutput\n");
        return;
    }

    GetNetworkOutput(first_390_data, net, net->output_ids, &result_id);

    printf("%i\n", result_id);
}

It’s probably something like wrong pointer call but after hours of trying to solve the issue, couldn’t find the issue with the code

When you compile your code with -lineinfo, compute-sanitizer should give you the file and line number where the error occurs.

However, even without line information the output is really helpful. It tells you exactly why the error occurs. You try to write to pointer, but the pointer is null.

Everytime you use dynamic allocation in device code, you need to check if the returned pointer is not nullptr. I do not see such checks in your code. Nullptr indicates allocation error, for example, out of memory.
You should also keep in mind that the device heap is small by default and a larger size needs to be set explicitly.

Personally, I would try to allocate the required device memory from the host instead of relying on device-side allocation. Judging from your code, all required sizes are known at kernel launch and are not computed dynamically on the device.

Hello, thank you for your reply!

I tried to debug the pointers to check if they are nullptr and yes, you are right, pointers are not being allocated. But wondering the reason

    float* outputs = new float[net->num_neurons];
    float* values = new float[net->num_neurons];
    int* completed = new int[net->num_connections];

    if (outputs == nullptr || values == nullptr || completed == nullptr) {
        printf("Memory allocation failed\n");
        return;
    }

Also tried to allocate memory via cudaMalloc

    float* outputs;
    float* values;
    int* completed;

    cudaMalloc((void**)&outputs, net->num_neurons * sizeof(float));
    cudaMalloc((void**)&values, net->num_neurons * sizeof(float));
    cudaMalloc((void**)&completed, net->num_connections * sizeof(int));

    if (outputs == nullptr || values == nullptr || completed == nullptr) {
        printf("Memory allocation failed\n");
        return;
    }

And output is

Memory allocation failed
Memory allocation failed
Memory allocation failed
Memory allocation failed
Memory allocation failed
...

How should I allocate memory and what’s actually wrong with that implementation?

Okay default memory was not enough. Used cudaDeviceSetLimit(cudaLimitMallocHeapSize, 1024 * 1024 * 1024); to set heap memory to 1 GB and now allocations are working perfectly