I’m still relatively new to CUDA, though getting more comfortable, but I’ve run into a bug I cannot find a solution to. I tried to switch a regular value assignment, using the equals operator, with atomicExch, but it seems to not assign the value. Running the mem-check tool gives me the following error
========= Program hit CUDA_ERROR_INVALID_ADDRESS_SPACE (error 717) due to "operation not supported on global/shared address space" on CUDA API call to cuModuleUnload.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:cuModuleLoadFatBinary [0x7ffeb93f9fc4]
========= in C:\WINDOWS\system32\DriverStore\FileRepository\nvblwi.inf_amd64_fc00febae1a6ee5f\nvcuda64.dll
========= Host Frame:cudart::module::unload [0x3f785]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:cudart::contextState::unloadAllModules [0x3fd96]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:cudart::contextStateManager::destroyAllContextStatesOnRuntimeUnload [0x41128]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:cudart::globalState::~globalState [0x4305b]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:cudart::globalState::registerVar [0x51c66]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:minkernel\crts\ucrt\src\appcrt\startup\onexit.cpp:206:<lambda_f03950bc5685219e0bcd2087efbe011e>::operator() [0x9cd1f]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:VCCRT\vcruntime\inc\internal_shared.h:204:__crt_seh_guarded_call<int>::operator()<<lambda_7777bce6b2f8c936911f934f8298dc43>,<lambda_f03950bc5685219e0bcd2087efbe011e> &,<lambda_3883c3dff614d5e0c5f61bb1ac94921c> > [0x9c971]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:minkernel\crts\ucrt\src\appcrt\startup\onexit.cpp:231:_execute_onexit_table [0x9ce25]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:minkernel\crts\ucrt\src\appcrt\startup\exit.cpp:232:<lambda_6e4b09c48022b2350581041d5f6b0c4c>::operator() [0x7e9e3]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:VCCRT\vcruntime\inc\internal_shared.h:224:__crt_seh_guarded_call<void>::operator()<<lambda_d80eeec6fff315bfe5c115232f3240e3>,<lambda_6e4b09c48022b2350581041d5f6b0c4c> &,<lambda_2358e3775559c9db80273638284d5e45> > [0x7e8a1]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:minkernel\crts\ucrt\src\appcrt\startup\exit.cpp:278:common_exit [0x7eb47]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:d:\a01\_work\43\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl:295:__scrt_common_main_seh [0x6300b]
========= in C:\Users\Me\Documents\Uni\MyClassParallel\bin\main.exe
========= Host Frame:BaseThreadInitThunk [0x7fff87fe7c24]
========= in C:\WINDOWS\System32\KERNEL32.DLL
========= Host Frame:RtlUserThreadStart [0x7fff882cd721]
========= in C:\WINDOWS\SYSTEM32\ntdll.dll
=========
I haven’t been able to figure out where this is coming from, a version which I was using to test the classes is the following:
The main file contains these methods
__device__
void TestEntriesOnDevices(MyClassEntry<addtype, remtype>* entry1, MyClassEntry<addtype, remtype>* entry2) {
entry1->setR(13456);
entry2->setR(23);
entry1->print();
entry2->print();
//entry1->exchValue(entry2);
entry1->print();
entry2->print();
return;
}
__global__
void TestEntries() {
MyClassEntry<addtype, remtype> entry1 = MyClassEntry<addtype, remtype>();
MyClassEntry<addtype, remtype> entry2 = MyClassEntry<addtype, remtype>();
TestEntriesOnDevices(&entry1, &entry2);
}
int main(void)
{
printf("Starting\n");
//Test(10);
TestEntries << <1, 1 >> > ();
cudaDeviceSynchronize();
return 0;
}
While this is the relevant parts of MyClassEntry.cu
#ifndef ENTRYINCLUDED
#define ENTRYINCLUDED
#include "TableEntry.h"
#endif
template <class ADD, class REM>
class MyClassEntry : TableEntry <ADD, REM> {
private:
int Rindex[2] = { 1, 56 };
public:
__host__ __device__
MyClassEntry(ADD R, int H, bool O) {
val = 0;
setR(R);
}
__host__ __device__
MyClassEntry() {
val = 0;
}
__host__ __device__
void setR(REM x) {
setBits(Rindex[0], Rindex[1], x);
}
__host__ __device__
REM getR() {
return (REM)getBits(Rindex[0], Rindex[1]);
}
};
and the relevant parts of the TableEntry.h file
#include <utility>
#include <iostream>
#include <bitset>
#include <inttypes.h>
template <class ADD, class REM>
class TableEntry {
protected:
uint64_t val;
__host__ __device__
void setBits(int start, int end, uint64_t ins) {
printf("\tIns %" PRIu64 "\n", ins);
uint64_t mask = ((((uint64_t)1) << end) - 1) ^ ((((uint64_t)1) << (start - 1)) - 1);
uint64_t tempval = val & ~mask; //Remove all of the bits currently in the positions
printf("\tTempval %" PRIu64 "\n", tempval);
ins = ins << (start - 1); //Shift new val to correct position
ins = ins & mask; //Mask the new val to prevent overflow
uint64_t newval = tempval | ins; //Place the new val
printf("\tnewval %" PRIu64 "\n", newval);
//In devices, atomically exchange
#ifdef __CUDA_ARCH__
printf("\tAtomic\n");
atomicExch(&val, newval);
printf("\tVal after %" PRIu64 "\n", val);
#else
val = newval;
#endif
}
}
I am relatively sure it’s the atomicExch causing the issue, since commenting it out causes the mem-check to not raise any concerns. Any help would be appreciated!