Hello all,
I’ve come across a very peculiar issue that can be reproduced with a trivial application:
#include <cuda_runtime_api.h>
#include <exception>
#include <sstream>
#include <stdexcept>
#include <thread>
// Wraps a CUDA runtime call: forwards its return code to __cudaSafeCall
// together with the file, line and enclosing function of the call site.
#define CUDA_SAFE_CALL(err) __cudaSafeCall(err, __FILE__, __LINE__, __func__)

// Checks the status of a CUDA runtime call and throws on failure.
//
//   err  - status returned by the wrapped runtime call
//   file - source file of the call site (used in the message only)
//   line - source line of the call site (used in the message only)
//   func - enclosing function of the call site (used in the message only)
//
// When the call itself reported success, the device is synchronized and the
// status of that synchronization is checked instead. Any status other than
// cudaSuccess raises std::runtime_error (declared in <stdexcept>) carrying the
// call site, the calling thread's id, the numeric error code and the text
// returned by cudaGetErrorString.
inline void __cudaSafeCall(cudaError err,
                           const char *file, const int line, const char *func)
{
    // Only synchronize when the call itself did not already fail, so the
    // original error code is never overwritten.
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    // Neither the call nor the synchronization reported a problem.
    if (err == cudaSuccess)
        return;

    std::stringstream errmsg;
    // NOTE(review): std::thread::id is streamed in an implementation-defined
    // format, so the literal "0x" prefix may end up in front of a decimal
    // value. Left unchanged so the message format stays the same.
    errmsg << file << "(" << line << ") : in " << func
           << "() @ thread 0x" << std::this_thread::get_id() << " : ";
    errmsg << " runtime API error " << err << " : " << cudaGetErrorString(err);
    throw std::runtime_error(errmsg.str());
}
// Reproduction body: allocates a device buffer of 1024*1024 floats, fills it
// with cudaMemset and frees it again. Every step goes through CUDA_SAFE_CALL,
// so the first failing step throws.
void this_thread()
{
    float *devBuf = nullptr;
    const size_t numBytes = 1024*1024*sizeof(float);

    CUDA_SAFE_CALL(cudaMalloc(reinterpret_cast<void**>(&devBuf), numBytes));
    // cudaMemset works byte-wise, so -1 writes 0xFF into every byte.
    CUDA_SAFE_CALL(cudaMemset(devBuf, -1, numBytes));
    CUDA_SAFE_CALL(cudaFree(devBuf));
}
// Entry point of the reproduction case. The disabled branch runs the test body
// on a separate std::thread; the active branch calls it directly on the main
// thread. An exception thrown by this_thread() is deliberately not caught.
int main()
{
#if 0
    // Variant: run the allocation/memset/free sequence on a worker thread.
    std::thread *worker = new std::thread(this_thread);
    worker->join();
#else
    // Variant: run the same sequence on the main thread.
    this_thread();
#endif
    return 0;
}
This code runs without any issues on its own, but fails for me when run under nvprof, with:
==208150== NVPROF is profiling process 208150, command: ./nvprof-fail
terminate called after throwing an instance of 'std::runtime_error'
what(): nvprof-fail.cu(27) : in this_thread() @ thread 0x140460196656896 : runtime API error 700 : an illegal memory access was encountered
==208150== Profiling application: ./nvprof-fail
==208150== Profiling result:
No kernels were profiled.
No API activities were profiled.
==208150== Warning: Some profiling data are not recorded. Make sure cudaProfilerStop() or cuProfilerStop() is called before application exit to flush profile data.
======== Error: Application received signal 6
The dmesg output seems to show the error as an attempt to dereference a null pointer, I think:
[ +0.000003] NVRM: Xid (PCI:0000:01:00): 31, pid=208141, Ch 00000010, intr 00000000. MMU Fault: ENGINE GRAPHICS GPCCLIENT_T1_0 faulted @ 0x0_00000000. Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE
even though the value of ptr is valid. (The reason why the code has a variant with the allocations happening on a separate thread is that I originally thought threading was the issue, but apparently it also fails when everything runs on the main thread.)
I’m running Debian unstable with the CUDA toolkit installed via distro packages (10.1.168-3), driver version 430.64 on kernel version 5.3.9 (both from distro packages). The hardware is a GTX 1650 on a Dell XPS 15 7590: 01:00.0 3D controller [0302]: NVIDIA Corporation Device [10de:1f91] (rev a1).
(I do not see this issue on another Debian system, which is still on toolkit version 9.2.148 with driver version 410.93 and kernel version 4.19.16, using a 1080.)
Is there any additional information I can provide to help pinpoint the issue?