unify memory and tasking

Hello all,

I am refreshing my mind with cuda, specially the unify memory (my last real cuda dev was 3 years ago), I am a bit rusted.

The pb:

I am creating a task from a container using unify memory. However, I get a crash, after a few days of investigation,
I am not able to say where is the crash (copy constructor), but not why. Because all pointers are allocated correctly.

I am not in contraction with Nvidia post (https://devblogs.nvidia.com/parallelforall/unified-memory-in-cuda-6/)
about C++ and unify memory

Help very appreciated

Best,

Timocafe

my machine: cuda 7.5, gcc 4.8.2, Tesla K20 m

#include <cuda.h>
#include <cstdio>

template<class T>
struct container{
    container(int size = 1){ cudaMallocManaged(&p,size*sizeof(T));}
    ~container(){cudaFree(p);}
    __device__ __host__ T& operator[](int i){ return p[i];}
    T * p;
};

struct task{
    int* a;
};

__global__ void kernel_gpu(task& t, container<task>& v){ 
    printf(" gpu value task %i, should be 2 \n", *(t.a)); // this work
    task tmp(v[0]); // BUG
    printf(" gpu value task from vector %i, should be 1 \n", *(tmp.a));
}

void kernel_cpu(task& t, container<task>& v){ 
    printf(" cpu value task %i, should be 2 \n", *(t.a)); // this work
    task tmp(v[0]);
    printf(" cpu value task from vector %i, should be 1 \n", *(tmp.a));
}

int main(int argc, const char * argv[]) {
    int* p1; 
    int* p2; 
    cudaMallocManaged(&p1,sizeof(int));
    cudaMallocManaged(&p2,sizeof(int));
    *p1 = 1;
    *p2 = 2;

    task t1,t2;
    t1.a=p1;
    t2.a=p2;

    container<task> c(2);

    c[0] = t1; 
    c[1] = t2; 

    //gpu does not work
    kernel_gpu<<<1,1>>>(c[1],c);
    cudaDeviceSynchronize();

    //cpu should work, no concurent access
    kernel_cpu(c[1],c);

    printf("job done !\n");

    cudaFree(p1);
    cudaFree(p2);

    return 0;
}