Nvprof and Nsight returning different results for L1 and L2 cache hit rates

I get different L1 and L2 cache statistics when I profile the same executable with nvprof and with Nsight Compute.

The machine configuration is:
GPU : P40
CUDA version : 10.1

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_profiler_api.h"

// Fills the leading half of x and y with index-derived values.
//
// Each thread computes one global index from the x-dimension of the launch.
// For an index idx below n/2 it stores idx into x[idx] and 2*idx into y[idx]
// (both converted to float). Indices at or beyond n/2 are left untouched.
// The scalar `a` is accepted but not used by this kernel.
__global__ void initialization(int n, float a, float *x, float *y)
{
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  // Guard clause: threads past the first half have nothing to write.
  if (idx >= n/2) {
    return;
  }
  x[idx] = idx;
  y[idx] = 2*idx;
}

// Single-precision a*x + y, one element per thread.
//
// The global index is derived from the x-dimension of the launch only, so a
// 1D grid supplying at least n threads in total is expected. Thread i updates
// y[i] in place with a*x[i] + y[i]; threads whose index is >= n do nothing,
// which lets the grid be rounded up past n.
//
//   n : number of elements to process
//   a : scalar multiplier applied to each x[i]
//   x : input vector (only read by this kernel)
//   y : input/output vector, overwritten with the result
__global__
void saxpy(int n, float a, float *x, float *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = a*x[i] + y[i];
}

// Aborts with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(err_));                                  \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
#endif

// Runs saxpy over 1M floats that live in mapped pinned host memory, with the
// kernel launch bracketed by cudaProfilerStart/cudaProfilerStop, then checks
// the result on the host and prints the maximum deviation from 4.0.
//
// Returns 0 on success; exits with EXIT_FAILURE if any checked CUDA call fails.
int main(void)
{
  const int N = 1<<20;
  const int threadsPerBlock = 256;
  const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div
  const size_t bytes = N * sizeof(float);
  float *x, *y, *d_x, *d_y;

  // Pinned host buffers that are mapped into the device address space.
  CUDA_CHECK(cudaHostAlloc((void **)&x, bytes, cudaHostAllocMapped));
  CUDA_CHECK(cudaHostAlloc((void **)&y, bytes, cudaHostAllocMapped));

  // Device-side aliases of the host buffers above; these are NOT separate
  // allocations and must not be passed to cudaFree.
  CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_x, x, 0));
  CUDA_CHECK(cudaHostGetDevicePointer((void **)&d_y, y, 0));

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  CUDA_CHECK(cudaDeviceSynchronize());
  // Perform SAXPY on 1M elements
  //initialization<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);
  // The profiler range calls are intentionally left unchecked: they only
  // scope data collection and should not abort the computation.
  cudaProfilerStart();
  saxpy<<<blocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
  // Kernel launches are asynchronous: wait for completion before closing the
  // profiling range and before the host reads y.
  CUDA_CHECK(cudaDeviceSynchronize());
  cudaProfilerStop();

  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
  printf("Max error: %f\n", maxError);

  // Only the host allocations own memory; releasing them also invalidates
  // d_x and d_y.
  CUDA_CHECK(cudaFreeHost(x));
  CUDA_CHECK(cudaFreeHost(y));
  return 0;
}

Compilation command: nvcc -Xptxas -O0 -Xptxas -dlcm=cg -Xptxas -dscm=cg -o saxpy_orig_cg_cg saxpy.cu

nvvp (NVIDIA Visual Profiler) results:
Unified cache hit rate: 50%
L2 cache hit rate: 33%

Nsight Compute results:
Unified cache hit rate: 0.00%
L2 cache hit rate: 42.23%

Could someone explain why the two tools report different hit rates for the same kernel, and which of them I should trust?
Thanks in advance.