I have been experiencing a segfault or ‘bus error’ on a Jetson Xavier AGX. I have a fairly complicated application, but I managed to distill the crash down to the following access pattern:
There are at least two threads, each with their own private managed memory buffer. Each thread writes to its own buffer from both CPU and GPU code. The code below reproduces the error. It does not crash on my desktop (GTX 1070 Ti), but it does crash on my Xavier AGX.
#include <cuda_runtime.h>
#include <thread>
#include <mutex>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>

#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
                file, line, errorMessage, (int)err, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

namespace {

const size_t bufLen = 2048;

// If this mutex is locked, there is no crash!
std::mutex mtx;

// The only difference between loop1 and loop2 is that the second
// uses cudaMemset.
void loop1() {
    float *dataBlock2 = nullptr;
    cudaMallocManaged(&dataBlock2, sizeof(float)*bufLen);
    for (int i=0; ; i++) {
        {
            //std::lock_guard<std::mutex> lck(mtx);
            // CPU-only writes to this thread's private managed buffer.
            for (size_t j=0; j<bufLen; j++) dataBlock2[j] += 1.0f;
            cudaDeviceSynchronize();
            getLastCudaError("post warp");
        }
        usleep(10'000);
        if (i % 10 == 0) printf(" - fast : %d\n", i);
    }
    cudaFree(dataBlock2);
}

void loop2() {
    float *dataBlock2 = nullptr;
    cudaMallocManaged(&dataBlock2, sizeof(float)*bufLen);
    for (int i=0; ; i++) {
        {
            //std::lock_guard<std::mutex> lck(mtx);
            // CPU writes followed by a device-side memset on the same buffer.
            for (size_t j=0; j<bufLen; j++) dataBlock2[j] += 1.0f;
            cudaMemset(dataBlock2, 0, sizeof(float)*bufLen/2);
            cudaDeviceSynchronize();
            getLastCudaError("post warp");
        }
        usleep(10'000);
        if (i % 10 == 0) printf(" - fast : %d\n", i);
    }
    cudaFree(dataBlock2);
}

} // namespace

int main() {
    //
    // This does not crash. It requires using cudaMemset in at least one thread.
    // std::thread thread1(&loop1), thread2(&loop1);
    std::thread thread1(&loop1), thread2(&loop2);
    thread1.join();
    thread2.join();
    return 0;
}
As noted in a comment in the code, the crash only happens when cudaMemset is called in at least one of the threads. Sometimes the failure is a segfault, sometimes a bus error.
Also, if I serialize the buffer accesses by locking a mutex around them, there is no crash. Uncomment the lock_guard lines to see the program run without error.
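If the underlying problem really is concurrent CPU/GPU access to managed memory, I assume the stream-attach APIs would be the intended alternative to a global mutex: give each thread its own stream, attach its buffer to that stream, and only synchronize that stream. This is just a rough sketch of what I have in mind for loop2 (same bufLen as above; the choice of cudaMemAttachSingle, cudaMemsetAsync and the non-blocking stream are my guesses, not something I have verified on the Xavier):

void loop2_with_stream() {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    float *dataBlock2 = nullptr;
    cudaMallocManaged(&dataBlock2, sizeof(float)*bufLen);
    // Associate the whole allocation with this thread's stream only.
    cudaStreamAttachMemAsync(stream, dataBlock2, 0, cudaMemAttachSingle);
    cudaStreamSynchronize(stream);

    for (int i=0; ; i++) {
        // CPU access: legal once the owning stream is idle.
        for (size_t j=0; j<bufLen; j++) dataBlock2[j] += 1.0f;
        // GPU-side memset issued only on this thread's stream.
        cudaMemsetAsync(dataBlock2, 0, sizeof(float)*bufLen/2, stream);
        cudaStreamSynchronize(stream);
        getLastCudaError("post memset");
        usleep(10'000);
        if (i % 10 == 0) printf(" - fast : %d\n", i);
    }

    cudaFree(dataBlock2);
    cudaStreamDestroy(stream);
}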
I expect that any memory allocated using cudaMallocManaged()
is paged to the device needed, but the issue seems to be stemming from such a transfer, and somehow only when accessing data from different threads. Am I missing a piece of documentation that tells me not to use managed memory in different threads, even if the buffers don’t overlap?
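For reference, this is how I plan to check whether the two machines differ in managed-memory capabilities (the desktop vs. the Xavier); a minimal query sketch using cudaDeviceGetAttribute, with no results claimed yet:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int dev = 0, concurrent = 0, pageable = 0;
    cudaGetDevice(&dev);
    // 1 if the device can access managed memory concurrently with the CPU.
    cudaDeviceGetAttribute(&concurrent, cudaDevAttrConcurrentManagedAccess, dev);
    // 1 if the device can coherently access pageable host memory.
    cudaDeviceGetAttribute(&pageable, cudaDevAttrPageableMemoryAccess, dev);
    printf("concurrentManagedAccess=%d pageableMemoryAccess=%d\n", concurrent, pageable);
    return 0;
}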