I am seeing what I think is odd CUDA behaviour on a Jetson Nano. This is on the latest R32.3.1 image but has the same results the previous release as well. It is a very simple application that allocates managed memory in the main thread while executing an empty CUDA function in a separate thread. These two tasks are not related and do not share any information. I feel the two tasks should be able to run in parallel.
Here is the code:
main.cpp:
#include <unistd.h>
#include <future>
#include "TestCuda.cuh"
std::mutex m;
// This works fine with a mutex, but crashes with a sigbus error when not using a mutex
//#define USE_MUTEX
struct MyThread {
void run() {
int threadLoop = 0;
while(1) {
#ifdef USE_MUTEX
m.lock();
#endif
printf("Thread Run (loop %d)\n", threadLoop++);
// run kernel
testCuda();
#ifdef USE_MUTEX
m.unlock();
#endif
usleep(0);
}
}
};
int main(int argc, char** argv) {
MyThread thread;
auto threadFuture = std::async(std::launch::async, &MyThread::run, thread);
int loop = 0;
while(1){
#ifdef USE_MUTEX
m.lock();
#endif
int* temp = nullptr;
printf("*** Main Allocating (loop = %d)\n", loop++);
cudaError_t err = cudaMallocManaged(&temp, sizeof(int)); // yes, this is a memory leak, but we keep the code simple to demonstrate the issue
if (err != cudaSuccess) {
printf("Failed to cudaMallocManaged()\n");
return -1;
}
*temp = 0; // <-- SIGBUS occurs here if don't use a mutex
printf("*** Main Finished Allocating value: %d\n", *temp);
#ifdef USE_MUTEX
m.unlock();
#endif
usleep(0);
}
}
TestCuda.cuh:
#include <cuda.h>
#include <cuda_runtime.h>
__global__ void testKernel();
extern "C" {
void testCuda();
}
TestCuda.cu
#include "TestCuda.cuh"
#include <stdio.h>
__global__ void testKernel() {
printf ("Kernel running\n");
}
void testCuda() {
testKernel<<<1,1>>>();
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
printf("SYNC FAILED\n\n\n");
}
}
When we run with a mutex, i.e. in main.cpp
#define USE_MUTEX
it works fine, as one would expect and runs forever:
*** Main Allocating (loop = 0)
*** Main Finished Allocating value: 0
Thread Run (loop 0)
Thread Kernel running
*** Main Allocating (loop = 1)
*** Main Finished Allocating value: 0
Thread Run (loop 1)
Thread Kernel running
*** Main Allocating (loop = 2)
*** Main Finished Allocating value: 0
*** Main Allocating (loop = 3)
*** Main Finished Allocating value: 0
*** Main Allocating (loop = 4)
*** Main Finished Allocating value: 0
*** Main Allocating (loop = 5)
*** Main Finished Allocating value: 0
Thread Run (loop 2)
Thread Kernel running
*** Main Allocating (loop = 6)
*** Main Finished Allocating value: 0
Thread Run (loop 3)
Thread Kernel running
*** Main Allocating (loop = 7)
*** Main Finished Allocating value: 0
...
If we comment out the mutex:
//#define USE_MUTEX
and run in parallel we get an arbitrarily timed failure (generally quickly) when trying to dereference this in the main thread:
*temp = 0; // <-- SIGBUS occurs here if don't use a mutex
i.e.
*** Main Allocating (loop = 0)
Thread Run (loop 0)
*** Main Finished Allocating value: 0
*** Main Allocating (loop = 1)
Bus error
Perhaps I missed something simple? Is there a reason why I cannot allocate memory in one thread while executing an independent kernel (that does not touch this memory) in another?