Segmentation fault when using UM (unified memory) and pthreads

Here is an example of using stream attach to solve the seg fault problem. Note that this simple example only demonstrates the case where each pthread is using an independent set of managed allocations. If managed allocations are shared between two or more pthreads, you cannot use this method.

In this case, I am running on a cc3.5 (compute capability 3.5) device, which is in the same UM regime as Windows and Jetson: concurrent managed access is not supported.

$ cat t2219.cu
#include <pthread.h>
#include <iostream>

using mt = int;              // element type shared by the kernel and host buffers
const int num_loops = 1024;  // kernel launches (i.e. increments of each element) per thread
const int nBLK = 2;          // blocks per kernel launch
const int nTPB = 128;        // threads per block
const int num_pthreads = 4;  // number of host worker threads (one stream + one allocation each)
const int ds = 1048576;      // elements per managed allocation (1M)

// Grid-stride increment kernel: adds 1 to each of the N elements of data.
// Correct for any grid/block configuration (including <<<1,1>>>).
// No shared memory; stream semantics are whatever the caller launches on.
__global__ void k(mt *data, size_t N){

  // Compute index and stride in size_t: the unsigned-int products
  // blockIdx.x*blockDim.x and gridDim.x*blockDim.x can overflow 32 bits
  // for very large grids while N is a size_t.
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    data[idx]++;
}

// Per-thread work descriptor handed to threadFunc (one instance per pthread).
struct threadInfo
{
    mt *data;        // this thread's private managed allocation
    size_t my_N;     // number of elements in data
    cudaStream_t s;  // this thread's private stream (data may be attached to it)
    mt test;         // last value the thread read back from data[0] on the host
};

// Worker routine run by each pthread.
// arg: pointer to this thread's threadInfo (its own managed allocation and
// its own stream; nothing is shared with other worker threads).
// Repeatedly launches the increment kernel on its stream, synchronizes the
// stream, then reads the managed data from the host. On a device without
// concurrent managed access that host read is only legal because the
// allocation was attached to this stream with cudaMemAttachSingle (see main).
void *threadFunc(void* arg)
{
    struct threadInfo* threadInfoStruct = (struct threadInfo*) arg;
    for (int i = 0; i < num_loops; i++){
      k<<<nBLK, nTPB, 0, threadInfoStruct->s>>>(threadInfoStruct->data, threadInfoStruct->my_N);
      // A kernel launch does not return a status directly: launch-configuration
      // errors surface via cudaGetLastError(), execution errors at the sync.
      cudaError_t err = cudaGetLastError();
      if (err == cudaSuccess) err = cudaStreamSynchronize(threadInfoStruct->s);
      if (err != cudaSuccess){
        std::cerr << "CUDA error in worker thread: " << cudaGetErrorString(err) << std::endl;
        pthread_exit(NULL);}
      threadInfoStruct->test = (threadInfoStruct->data)[0]; // host read of managed memory
    }
    pthread_exit(NULL);
}


int main() {

  // One managed allocation and one stream per worker thread. The sets are
  // disjoint across threads, which is the precondition for attaching each
  // allocation to a single stream with cudaMemAttachSingle.
  mt *data[num_pthreads];  // use the mt alias (was int*) so the element type tracks the kernel's
  cudaStream_t str[num_pthreads];
  for (int i = 0; i < num_pthreads; i++){
    cudaError_t err = cudaMallocManaged(data+i, sizeof(mt)*ds);
    if (err != cudaSuccess){
      std::cout << "cudaMallocManaged error: " << cudaGetErrorString(err) << std::endl;
      return 1;}
    // Host-side initialization is safe here: no kernel has run yet.
    for (int j = 0; j < ds; j++) data[i][j] = 0;
    err = cudaStreamCreate(str+i);
    if (err != cudaSuccess){
      std::cout << "cudaStreamCreate error: " << cudaGetErrorString(err) << std::endl;
      return 1;}
#ifdef USE_STREAM_ATTACH
    // Restrict this allocation's visibility to its own stream so that, on
    // devices without concurrent managed access, the host may touch it while
    // kernels run in the other threads' streams.
    cudaStreamAttachMemAsync(str[i], data[i], 0, cudaMemAttachSingle);
    cudaStreamSynchronize(str[i]);
#endif
    }
  threadInfo ti[num_pthreads];
  pthread_t threads[num_pthreads];
  for (int i = 0; i < num_pthreads; i++){
    ti[i].data = data[i];
    ti[i].my_N = ds;
    ti[i].s = str[i];
    int rs = pthread_create(threads+i, NULL, threadFunc, (void *) (ti+i));
    if (rs != 0) std::cout << "pthread_create error: " << rs << std::endl;}
  for (int i = 0; i < num_pthreads; i++){
    int rs = pthread_join(threads[i], NULL);
    if (rs != 0) std::cout << "pthread_join error: " << rs << std::endl;}
  for (int i = 0; i < num_pthreads; i++)
    std::cout << "thread: " << i << " expected value: " << num_loops << " final value: " << ti[i].test << std::endl;
  // Release per-thread resources (the original leaked both streams and
  // managed allocations).
  for (int i = 0; i < num_pthreads; i++){
    cudaStreamDestroy(str[i]);
    cudaFree(data[i]);}
  return 0;
}

$ nvcc -o t2219 t2219.cu -arch=sm_35
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ CUDA_VISIBLE_DEVICES="1" ./t2219
Bus error (core dumped)
$ nvcc -o t2219 t2219.cu -arch=sm_35 -DUSE_STREAM_ATTACH
nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ CUDA_VISIBLE_DEVICES="1" ./t2219
thread: 0 expected value: 1024 final value: 1024
thread: 1 expected value: 1024 final value: 1024
thread: 2 expected value: 1024 final value: 1024
thread: 3 expected value: 1024 final value: 1024
$

You may wish to refer to the programming guide to learn more about stream memory attachment.