Unified memory and 1D surfaces error

Hello.

I am unsuccessfully trying to create and use Surface objects with memory allocated with cudaMallocManaged.
Here an example:

#include <cuda_runtime.h>
#include <stdio.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

#define ELEMS 4

// just print 'hello' + elements in surf1
__global__ void print_kernel(cudaSurfaceObject_t surf1){

    if(threadIdx.x == 0){
        printf("hello\n");
        for(int x=0; x<ELEMS; x++){
            printf("%d\n", surf1Dread<int>(surf1, x*sizeof(int)));
        }
    }
}

int main(){

int surfhost1[] = {10, 20, 30, 40};
    int* surfshared1;

    gpuErrchk(cudaMallocManaged(&surfshared1, 4*sizeof(int)));

    // copy to shared memory region
    for(int i=0; i<4; i++){
        surfshared1[i] = surfhost1[i];
    }

    // 1D integer surface
    cudaSurfaceObject_t surfdev1;
    cudaChannelFormatDesc channelDesc1 =  cudaCreateChannelDesc<int>();

    // Resource descriptor
    cudaResourceDesc res_desc1;
    res_desc1.resType = cudaResourceTypeLinear;
    res_desc1.res.linear.devPtr = surfshared1;
    res_desc1.res.linear.desc = channelDesc1;
    res_desc1.res.linear.sizeInBytes = 4*sizeof(int);

    gpuErrchk(cudaMemPrefetchAsync(surfshared1, 4*sizeof(int), 0));
    
    // is it necessary or not?
    gpuErrchk(cudaDeviceSynchronize());

    // the program fails here
    gpuErrchk(cudaCreateSurfaceObject(&surfdev1, &res_desc1));

    print_kernel<<<1,1>>>(surfdev1);

    gpuErrchk(cudaDeviceSynchronize());

    cudaDestroySurfaceObject(surfdev1);
    cudaFree(surfshared1);
}

The code above fails when cudaCreateSurfaceObject is called with the following error output “GPUassert: invalid argument test_cuda_2.cu 57”.

I am using NVIDIA CUDA toolkit V9.1.85 on Ubuntu 18.04. My graphic card is a Geforce 1060.
To compile the above example I use:

nvcc test_cuda_2.cu --gpu-architecture=sm_61 -o test_cuda_2

Thank you in advance!

It’s not supported. Use ordinary device memory via e.g. cudaMalloc

Thanks for the answer.
By the way, isn’t there any other supported way to cache unified address space memory?