Demo program crashes with a segmentation fault

When I copy the code from "Unified Memory for CUDA Beginners" and compile it with CMake using the following file:

# CMakeLists.txt to build m4emtest.cu (the demo code, renamed from the original)
cmake_minimum_required(VERSION 2.8)
find_package(CUDA QUIET REQUIRED)

# Specify binary name and source file to build it from
cuda_add_executable(
    current
    m4emtest.cu)

The program crashes between the "ST wo " and "ST hree " printouts:

   int main(void)
    {
      int N = 1<<20;
      float *x, *y;
         printf("ST art \n");
      // Allocate Unified Memory -- accessible from CPU or GPU
      cudaMallocManaged(&x, N*sizeof(float));
      cudaMallocManaged(&y, N*sizeof(float));//1048576
      
      // initialize x and y arrays on the host
       printf("ST wo \n");
      for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
      }
              printf("ST hree \n");

Why is that?

Can anyone pick this up, please? Programming in CUDA gets very difficult otherwise.

cudaMallocManaged seems not to be implemented (yet).
As a workaround, use "cudaMalloc" together with explicit copies in both directions ("cudaMemcpyHostToDevice" and "cudaMemcpyDeviceToHost").
The following demo program does work on the Orin Nano Developer Kit:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>

#define N 25
#define MAX 501
/* Kernel: seeds one cuRAND generator state per block.
 *
 *   seed   - seed value shared by every block (supplied by the host)
 *   states - device array of generator states; entry blockIdx.x is written,
 *            so it must hold at least gridDim.x elements
 *
 * Only blockIdx.x is used as an index, so every thread of a block would
 * target the same entry; the kernel is meant to run with one thread per block.
 */
__global__ void init(unsigned int seed, curandState_t* states) {
  /* the block index doubles as the cuRAND sequence number so that each
     block draws from a different sequence even though the seed is shared */
  const unsigned int block = blockIdx.x;
  curandState_t* const slot = states + block;

  /* offset 0: do not skip ahead within the chosen sequence */
  curand_init(seed, block, 0, slot);
}

/* Kernel: draws one random unsigned int per block and stores it reduced
 * modulo MAX, i.e. in the range [0, MAX - 1].
 *
 *   states  - device array of generator states, previously set up with
 *             curand_init; entry blockIdx.x is read and advanced
 *   numbers - device output array; entry blockIdx.x is written
 *
 * Both arrays are indexed by blockIdx.x only, so they must hold at least
 * gridDim.x elements and the kernel is meant to run with one thread per block.
 */
__global__ void randoms(curandState_t* states, unsigned int* numbers) {
  const unsigned int block = blockIdx.x;
  /* curand works like rand - except that it takes a state as a parameter.
     Use the MAX macro instead of repeating its value, so that changing the
     macro really changes the range of the generated numbers. */
  numbers[block] = curand(&states[block]) % MAX;
}

/* Terminates the program with a diagnostic on stderr when a CUDA runtime call
   reports anything other than cudaSuccess. `what` names the failed step. */
static void check_cuda(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/* Generates N random numbers on the GPU (one block per number), copies them
   back to the host and prints one per line. Returns 0 on success; exits with
   EXIT_FAILURE as soon as any CUDA call or kernel launch fails, so that
   uninitialised host data is never printed as if it were a result. */
int main() {
  /* CUDA's random number library uses curandState_t to keep track of the seed value
     we will store a random state for every block */
  curandState_t* states = NULL;
  unsigned int* gpu_nums = NULL;
  unsigned int cpu_nums[N];

  /* allocate space on the GPU for the random states and for the results */
  check_cuda(cudaMalloc((void**) &states, N * sizeof(curandState_t)),
             "cudaMalloc(states)");
  check_cuda(cudaMalloc((void**) &gpu_nums, N * sizeof(unsigned int)),
             "cudaMalloc(gpu_nums)");

  /* invoke the GPU to initialize all of the random states; the current time
     is the seed, explicitly narrowed to the kernel's parameter type */
  init<<<N, 1>>>((unsigned int) time(0), states);
  /* a kernel launch returns nothing itself; launch errors must be fetched */
  check_cuda(cudaGetLastError(), "launch of init");

  /* invoke the kernel to get some random numbers */
  randoms<<<N, 1>>>(states, gpu_nums);
  check_cuda(cudaGetLastError(), "launch of randoms");

  /* copy the random numbers back; this blocking copy also waits for both
     kernels and reports any error raised while they were executing */
  check_cuda(cudaMemcpy(cpu_nums, gpu_nums, N * sizeof(unsigned int),
                        cudaMemcpyDeviceToHost),
             "cudaMemcpy(device to host)");

  /* print them out */
  for (int i = 0; i < N; i++) {
    printf("%u\n", cpu_nums[i]);
  }

  /* free the memory we allocated for the states and numbers */
  check_cuda(cudaFree(states), "cudaFree(states)");
  check_cuda(cudaFree(gpu_nums), "cudaFree(gpu_nums)");
  return 0;
}