AtomicExch Causing CUDA_ERROR_INVALID_ADDRESS_SPACE in MemCheck

I’m still relatively new to CUDA, though getting more comfortable, but I’ve run into a bug I cannot find a solution to. I tried to switch a regular value assignment, using the equals operator, with atomicExch, but it seems to not assign the value. Running the mem-check tool gives me the following error

========= Program hit CUDA_ERROR_INVALID_ADDRESS_SPACE (error 717) due to "operation not supported on global/shared address space" on CUDA API call to cuModuleUnload.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:cuModuleLoadFatBinary [0x7ffeb93f9fc4]
=========                in C:\WINDOWS\system32\DriverStore\FileRepository\nvblwi.inf_amd64_fc00febae1a6ee5f\nvcuda64.dll
=========     Host Frame:cudart::module::unload [0x3f785]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:cudart::contextState::unloadAllModules [0x3fd96]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload [0x41128]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:cudart::globalState::~globalState [0x4305b]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:cudart::globalState::registerVar [0x51c66]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:minkernel\crts\ucrt\src\appcrt\startup\onexit.cpp:206:<lambda_f03950bc5685219e0bcd2087efbe011e>::operator() [0x9cd1f]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:VCCRT\vcruntime\inc\internal_shared.h:204:__crt_seh_guarded_call<int>::operator()<<lambda_7777bce6b2f8c936911f934f8298dc43>,<lambda_f03950bc5685219e0bcd2087efbe011e> &,<lambda_3883c3dff614d5e0c5f61bb1ac94921c> > [0x9c971]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:minkernel\crts\ucrt\src\appcrt\startup\onexit.cpp:231:_execute_onexit_table [0x9ce25]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:minkernel\crts\ucrt\src\appcrt\startup\exit.cpp:232:<lambda_6e4b09c48022b2350581041d5f6b0c4c>::operator() [0x7e9e3]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:VCCRT\vcruntime\inc\internal_shared.h:224:__crt_seh_guarded_call<void>::operator()<<lambda_d80eeec6fff315bfe5c115232f3240e3>,<lambda_6e4b09c48022b2350581041d5f6b0c4c> &,<lambda_2358e3775559c9db80273638284d5e45> > [0x7e8a1]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:minkernel\crts\ucrt\src\appcrt\startup\exit.cpp:278:common_exit [0x7eb47]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:d:\a01\_work\43\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:295:__scrt_common_main_seh [0x6300b]
=========                in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
=========     Host Frame:BaseThreadInitThunk [0x7fff87fe7c24]
=========                in C:\WINDOWS\System32\KERNEL32.DLL
=========     Host Frame:RtlUserThreadStart [0x7fff882cd721]
=========                in C:\WINDOWS\SYSTEM32\ntdll.dll
=========

I haven’t been able to figure out where this is coming from, a version which I was using to test the classes is the following:

The main file contains these methods

__device__
void TestEntriesOnDevices(MyClassEntry<addtype, remtype>* entry1, MyClassEntry<addtype, remtype>* entry2) {
    entry1->setR(13456);
    entry2->setR(23);

    entry1->print();
    entry2->print();

    //entry1->exchValue(entry2);

    entry1->print();
    entry2->print();

    return;
}

__global__
void TestEntries() {
    MyClassEntry<addtype, remtype> entry1 = MyClassEntry<addtype, remtype>();
    MyClassEntry<addtype, remtype> entry2 = MyClassEntry<addtype, remtype>();

    TestEntriesOnDevices(&entry1, &entry2);
}

int main(void)
{
    printf("Starting\n");
	//Test(10);
    TestEntries << <1, 1 >> > ();
    cudaDeviceSynchronize();

    return 0;
}

While this is the relevant parts of MyClassEntry.cu

#ifndef ENTRYINCLUDED
#define ENTRYINCLUDED
#include "TableEntry.h"
#endif


template <class ADD, class REM>
class MyClassEntry : TableEntry <ADD, REM> {

private:
    int Rindex[2] = { 1, 56 };

public:
    __host__ __device__
    MyClassEntry(ADD R, int H, bool O) {
        val = 0;
        setR(R);
    }

    __host__ __device__
    MyClassEntry() {
        val = 0;
    }


    __host__ __device__
    void setR(REM x) {
        setBits(Rindex[0], Rindex[1], x);
    }

    __host__ __device__
    REM getR() {
        return (REM)getBits(Rindex[0], Rindex[1]);
    }

};

and the relevant parts of the TableEntry.h file

#include <utility>
#include <iostream>
#include <bitset>
#include <inttypes.h>


template <class ADD, class REM>
class TableEntry {

protected:
    uint64_t val;

    __host__ __device__
    void setBits(int start, int end, uint64_t ins) {
        printf("\tIns %" PRIu64 "\n", ins);
        uint64_t mask = ((((uint64_t)1) << end) - 1) ^ ((((uint64_t)1) << (start - 1)) - 1);
        uint64_t tempval = val & ~mask;      //Remove all of the bits currently in the positions
        printf("\tTempval %" PRIu64 "\n", tempval);
        ins = ins << (start - 1);   //Shift new val to correct position
        ins = ins & mask;       //Mask the new val to prevent overflow
        uint64_t newval = tempval | ins;        //Place the new val
        printf("\tnewval %" PRIu64 "\n", newval);
        //In devices, atomically exchange
        #ifdef  __CUDA_ARCH__
        printf("\tAtomic\n");
        atomicExch(&val, newval);
        printf("\tVal after %" PRIu64 "\n", val);
        #else
        val = newval;
        #endif
    }
}

I am relatively sure it’s the atomicExch causing the issue, since commenting it out causes the mem-check to not raise any concerns. Any help would be appreciated!

atomicExch cannot operate on variables in registers.

An atomic function performs a read-modify-write atomic operation on one 32-bit or 64-bit word residing in global or shared memory.

1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.