In your case, it should be only:
cudaSetDeviceFlags(cudaDeviceMapHost); // NOTE: must run before the CUDA context is created (i.e., before any other runtime call that touches the device)
float *h_a; // address of buffer from CPU side
float *d_a; // address of buffer from GPU side
cudaHostAlloc((void **)&h_a, size*sizeof(float), cudaHostAllocMapped); // Allocate pinned, mapped buffer and get its CPU side address (check the returned cudaError_t in real code)
cudaHostGetDevicePointer((void **)&d_a, (void *)h_a, 0); // Get GPU side address of the same buffer
...//Fill your buffer from CPU using address h_a
kernel<<<blocks, threads>>>(d_a); // Execute kernel on GPU using address d_a
cudaDeviceSynchronize(); // REQUIRED: kernel launches are asynchronous and zero-copy has no implicit sync, so wait before the CPU reads the results
...//Read processed buffer from CPU with address h_a
cudaFreeHost(h_a); // Release the pinned allocation when done (d_a aliases it; do not free separately)
It is important to pass the flag cudaHostAllocMapped to cudaHostAlloc so that the buffer is allocated in pinned host memory that is mapped into the CUDA address space. It is then accessible both from the CPU (via the address in h_a) and from the GPU (via the device address in d_a returned by cudaHostGetDevicePointer). Memory allocated with plain malloc is pageable and cannot be mapped into the device address space, so it is useless for this purpose.