I’ve been developing a framework around the persistent threads model of GPU programming. It uses zero-copy pinned memory for communication between the CPU and CUDA, and it runs on the Jetson TX2. I’ve run into some issues with memory consistency between the CPU and GPU, in the formal sense of memory consistency.
The Jade Alglave et al. paper, “GPU concurrency: Weak behaviours and programming assumptions”, makes it clear that the CUDA environment itself has a weak memory-ordering model, and that thread fences (membar instructions) are the tool for dealing with it.
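For the GPU-only case, my understanding of the message-passing idiom those fences support is roughly the following. This is a minimal sketch, not code from the paper; the kernel name and the value 42 are made up, and it assumes the two blocks are resident on the device at the same time.

#include <cstdio>

__device__ volatile int g_data;  // payload
__device__ volatile int g_flag;  // publication flag

// Launch as gpu_only_handshake<<<2, 1>>>()
__global__ void gpu_only_handshake()
{
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    g_data = 42;            // producer: write the payload first
    __threadfence();        // order the payload write before the flag write
    g_flag = 1;             // publish
  } else if (blockIdx.x == 1 && threadIdx.x == 0) {
    while (g_flag != 1) {}  // consumer: spin until the flag is published
    __threadfence();        // order the flag read before the payload read
    printf("consumer read %d\n", g_data);  // expect 42
  }
}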
But my case goes beyond that GPU-only scenario, because the memory buffers are shared between the CPU and the GPU. So far I have been unable to arrange thread fences and memory barriers so that memory consistency holds 100% of the time, and I’m not entirely sure it’s possible. If it isn’t, this looks like a hardware bug to me. (Although I’d be thrilled to be proven wrong!)
The gist is this: there are two variables, Value and Lock. (Don’t get hung up on the name “Lock”; there is no actual contention for this variable. It is just used to tell the CPU when *Value is ready.) Imagine both start at 0.
On the GPU there is only one thread that interacts with these variables. It does this:
GPU: *Value = val;
GPU: __threadfence_system();
GPU: *Lock = val;
Meanwhile,
CPU: while (*Lock != val) {}
CPU: asm volatile("dmb sy");
CPU: result = *Value;
CPU: assert(result == val);
And there are similar variables and similar code where the above takes place in reverse. That is, the CPU modifies the values and the GPU waits for and reads the results.
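Sketched in the same style (Value2 and Lock2 are just stand-in names for that second pair of variables):
CPU: *Value2 = val;
CPU: asm volatile("dmb sy");
CPU: *Lock2 = val;
Meanwhile,
GPU: while (*Lock2 != val) {}
GPU: __threadfence_system();
GPU: result = *Value2;
GPU: assert(result == val);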
Once in a while, the CPU sees that *Lock has been set to val, but *Value still holds the stale previous value.
I am including an example program inline below. It performs the logic above. It also has a number of extra GPU threads that run more or less independently, reading and writing quasi-random cells of an Unrelated zero-copy pinned memory buffer; this is just to stress the CUDA memory system and make timing problems more likely. While running it, I also run the open-source stress program (stress-1.0.4, available via apt-get) as follows: stress -c 5 -m 5 -i 5 -d 5. This is intended to stress the memory system as well, but from the CPU side.
Note that it usually takes tens to hundreds of millions of iterations before a failure is detected. Depending on how aggressive I got with (seemingly) unnecessary extra fences, I’ve seen it take billions of iterations before a failure.
I’d appreciate any comments.
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <string.h> // strerror
#include <limits.h>
#include <signal.h>
#include <cpuset.h>
#include <sched.h>
#include <sys/mman.h>
#include <driver_types.h>
#include <cuda_runtime_api.h>
//------------------------------------------------------------------------------
#define CUCHK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "%s(%i): error: %s (%d)\n", \
__FILE__, __LINE__, cudaGetErrorString(err), err); \
exit(1); \
} \
} while(0)
#define CUCHK_LAUNCH() \
do { \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
fprintf(stderr, "%s(%i): kernel launch error: %s (%d)\n", \
__FILE__, __LINE__, cudaGetErrorString(err), err); \
exit(1); \
} \
} while(0)
//------------------------------------------------------------------------------
static void bind(int cpu)
{
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
printf("sched_setaffinity (CPU %u) failed; errno=%d (%s)\n",
cpu, errno, strerror(errno));
}
}
static void set_priority(int policy,
int priority)
{
struct sched_param sparms;
sparms.sched_priority = priority;
if (sched_setscheduler(0, policy, &sparms) == -1) {
printf("sched_setscheduler (%u,%u) failed; errno=%d (%s)\n",
policy, priority, errno, strerror(errno));
}
}
static void shield_cpu(int cpu,
const char* filename)
{
FILE* file = fopen(filename, "r+");
if (file == NULL) {
printf("open(\"%s\") failed; errno=%d (%s)\n",
filename, errno, strerror(errno));
return;
}
unsigned int mask;
fscanf(file, "%x\n", &mask);
mask |= (1 << cpu);
rewind(file); // reposition the "r+" stream before switching from reading to writing
fprintf(file, "%x\n", mask);
fflush(file);
fclose(file);
}
static void shield_cpu_all (int cpu)
{
shield_cpu(cpu, "/proc/shield/procs");
shield_cpu(cpu, "/proc/shield/irqs");
shield_cpu(cpu, "/proc/shield/ltmrs");
}
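// Lock memory, bind the process to CPU 3, shield that CPU, and run at real-time priority.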
static void real_time_setup (bool do_mlockall = true)
{
if (do_mlockall) {
mlockall(MCL_CURRENT | MCL_FUTURE);
}
bind(3);
shield_cpu_all(3);
set_priority(SCHED_FIFO, 80);
}
//------------------------------------------------------------------------------
namespace handlers {
static volatile bool dump = false; // set by the SIGQUIT handler, polled in the main loop
static volatile bool done = false; // set by the SIGINT/SIGTERM handlers, polled in the main loop
static void SIGQUIT_handler (int signo)
{
dump = true;
}
static void SIGINT_handler (int signo)
{
done = true;
}
static void SIGTERM_handler (int signo)
{
done = true;
}
static void setup ()
{
using namespace handlers;
{
struct sigaction act;
sigemptyset(&act.sa_mask);
act.sa_handler = SIGQUIT_handler;
act.sa_flags = 0;
sigaction(SIGQUIT, &act, NULL);
}
{
struct sigaction act;
sigemptyset(&act.sa_mask);
act.sa_handler = SIGINT_handler;
act.sa_flags = 0;
sigaction(SIGINT, &act, NULL);
}
{
struct sigaction act;
sigemptyset(&act.sa_mask);
act.sa_handler = SIGTERM_handler;
act.sa_flags = 0;
sigaction(SIGTERM, &act, NULL);
}
}
}
//------------------------------------------------------------------------------
#define UNRELATED_N (1024 * 1024) // must be a power of 2
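// Integer mixing hash, used only to scatter the stress accesses across Unrelated[].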
__device__ unsigned int hash (unsigned int x)
{
x = ((x >> 16) ^ x) * 0x45d9f3b;
x = ((x >> 16) ^ x) * 0x45d9f3b;
x = (x >> 16) ^ x;
return x;
}
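// Thread 0 of block 0 runs the CPU<->GPU handshake; every other thread just
// generates unrelated zero-copy memory traffic to stress the memory system.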
__global__ void persist(volatile int* ToCpuValue,
volatile int* ToCpuLock,
volatile int* ToCudaValue,
volatile int* ToCudaLock,
volatile int* CudaTerminated,
int* Unrelated)
{
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (long long test = 1;; test++) {
int val = test & 0x7fffffff;
// -> CPU
*ToCpuValue = val;
__threadfence_system();
*ToCpuLock = val;
// -> CUDA
while (*ToCudaLock != val) {
if (*CudaTerminated) {
printf("CUDA terminating on request on test %lld\n", test);
goto bailout;
}
}
__threadfence_system();
int toCudaValue = *ToCudaValue;
if (toCudaValue != val) {
printf("Failure detected on CUDA side on test %lld:\n", test);
printf(" ToCpuValue = %d\n"
" ToCpuLock = %d\n"
" ToCudaValue = %d <- WRONG\n"
" ToCudaLock = %d\n",
*ToCpuValue, *ToCpuLock, toCudaValue, *ToCudaLock);
goto bailout;
}
}
bailout:
*CudaTerminated = 1;
} else {
// In the other threads, stress the GPU memory system with quasi-random
// reads & writes using Unrelated[].
for (unsigned int test = 0; !*CudaTerminated; test++) {
unsigned int readIdx = test + blockIdx.x * blockDim.x + threadIdx.x;
unsigned int writeIdx = readIdx + gridDim.x * blockDim.x;
unsigned int readHash = hash(readIdx) & (UNRELATED_N - 1);
unsigned int writeHash = hash(writeIdx) & (UNRELATED_N - 1);
Unrelated[writeHash] = Unrelated[readHash] + 1;
}
}
}
//------------------------------------------------------------------------------
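// Host side: allocate the mapped (zero-copy) buffers, launch the kernel, and run the CPU half of the handshake loop.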
static void perform_cuda ()
{
volatile int* h_ToCpuValue;
volatile int* h_ToCpuLock;
volatile int* h_ToCudaValue;
volatile int* h_ToCudaLock;
volatile int* h_CudaTerminated;
int* h_Unrelated;
CUCHK(cudaHostAlloc((void**)&h_ToCpuValue, 1024 * sizeof(int),
cudaHostAllocMapped));
CUCHK(cudaHostAlloc((void**)&h_ToCpuLock, 1024 * sizeof(int),
cudaHostAllocMapped));
CUCHK(cudaHostAlloc((void**)&h_ToCudaValue, 1024 * sizeof(int),
cudaHostAllocMapped));
CUCHK(cudaHostAlloc((void**)&h_ToCudaLock, 1024 * sizeof(int),
cudaHostAllocMapped));
CUCHK(cudaHostAlloc((void**)&h_CudaTerminated, sizeof(int),
cudaHostAllocMapped));
CUCHK(cudaHostAlloc((void**)&h_Unrelated, UNRELATED_N * sizeof(int),
cudaHostAllocMapped));
*h_ToCpuValue = 0;
*h_ToCpuLock = 0;
*h_ToCudaValue = 0;
*h_ToCudaLock = 0;
*h_CudaTerminated = 0;
int* d_ToCpuValue;
int* d_ToCpuLock;
int* d_ToCudaValue;
int* d_ToCudaLock;
int* d_CudaTerminated;
int* d_Unrelated;
CUCHK(cudaHostGetDevicePointer(&d_ToCpuValue, (int*)h_ToCpuValue, 0));
CUCHK(cudaHostGetDevicePointer(&d_ToCpuLock, (int*)h_ToCpuLock, 0));
CUCHK(cudaHostGetDevicePointer(&d_ToCudaValue, (int*)h_ToCudaValue, 0));
CUCHK(cudaHostGetDevicePointer(&d_ToCudaLock, (int*)h_ToCudaLock, 0));
CUCHK(cudaHostGetDevicePointer(&d_CudaTerminated, (int*)h_CudaTerminated, 0));
CUCHK(cudaHostGetDevicePointer(&d_Unrelated, (int*)h_Unrelated, 0));
persist<<<2, 64>>>(d_ToCpuValue, d_ToCpuLock, d_ToCudaValue, d_ToCudaLock,
d_CudaTerminated, d_Unrelated);
CUCHK_LAUNCH();
for (long long test = 1;; test++) {
int val = test & 0x7fffffff;
// -> CPU
while (*h_ToCpuLock != val) {
if (*h_CudaTerminated) {
printf("CPU terminating on request on test %lld\n", test);
goto bailout;
}
}
asm volatile("dmb sy" ::: "memory"); // seems like overkill; the clobber also prevents compiler reordering
int toCpuValue = *h_ToCpuValue;
if (toCpuValue != val) {
printf("Failure detected on CPU side on test %lld:\n", test);
printf(" ToCpuValue = %d <- wrong\n"
" ToCpuLock = %d\n"
" ToCudaValue = %d\n"
" ToCudaLock = %d\n",
toCpuValue, *h_ToCpuLock, *h_ToCudaValue, *h_ToCudaLock);
}
// -> CUDA
*h_ToCudaValue = val;
asm volatile("dmb sy" ::: "memory"); // seems like overkill; the clobber also prevents compiler reordering
*h_ToCudaLock = val;
if (handlers::done) {
printf("test = %lld\n", test);
break;
}
if (handlers::dump) {
printf("test = %lld\n", test);
handlers::dump = false;
}
}
bailout:
*h_CudaTerminated = 1;
CUCHK(cudaDeviceSynchronize());
CUCHK(cudaFreeHost((int*)h_ToCpuValue));
CUCHK(cudaFreeHost((int*)h_ToCpuLock));
CUCHK(cudaFreeHost((int*)h_ToCudaValue));
CUCHK(cudaFreeHost((int*)h_ToCudaLock));
CUCHK(cudaFreeHost((void*)h_CudaTerminated));
CUCHK(cudaFreeHost(h_Unrelated));
}
int main (int argc,
char* argv[])
{
real_time_setup();
CUCHK(cudaSetDeviceFlags(cudaDeviceMapHost));
handlers::setup();
perform_cuda();
return 0;
}