Measuring cache access latency

In order to measure the cache hit latency, I wrote the following code, which consists of three kernels:
1- dummyKernel(), which is used to measure the time overhead of a kernel launch.
2- accessElement1(), which accesses the element of the array at position i.
3- accessElement2(), which accesses the element at position i + stride.

Each block has 1 thread, and the number of blocks in the grid is at most 1024.

#include <stdio.h>
#include <stdlib.h>

// Empty kernel, used to measure the overhead of a kernel launch.
__global__ void dummyKernel()
{
}

// Reads the element at position i.
__global__ void accessElement1( int *a )
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  volatile int z = a[ i ];
  (void)z;                       // silence the unused-variable warning
}

// Reads the element at position i + stride (assumes i + stride stays inside the array).
__global__ void accessElement2( int *a, int stride )
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  volatile int z = a[ i + stride ];
  (void)z;
}

int main(int argc, char** argv)
{
  if ( argc < 3 ) {
    printf("Usage: %s N stride\n", argv[0]);
    return 1;
  }

  int N = strtol(argv[1], NULL, 10);
  printf("N is %d\n", N);
  int stride = strtol(argv[2], NULL, 10);
  printf("stride is %d\n", stride);

  size_t bytes = N * sizeof( int );
  int *A = (int *)malloc(bytes);
  int *d_A;
  cudaMalloc( &d_A, bytes );

  for ( int i = 0; i < N; i++ ) {
    A[ i ] = 1;
  }

  cudaMemcpy( d_A, A, bytes, cudaMemcpyHostToDevice );

  int thr_per_blk = 1;
  int blk_in_grid = ( N + thr_per_blk - 1 ) / thr_per_blk;   // round up

  cudaEvent_t start, stop;
  float elapsedTime;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // Time an empty kernel to estimate the launch overhead.
  cudaEventRecord(start, 0);
  dummyKernel<<< blk_in_grid, thr_per_blk >>>();
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedTime, start, stop);
  printf("Dummy elapsed time : %f ms\n", elapsedTime);

  // Time a kernel that reads a[i].
  cudaEventRecord(start, 0);
  accessElement1<<< blk_in_grid, thr_per_blk >>>( d_A );
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedTime, start, stop);
  printf("Elapsed time1 : %f ms\n", elapsedTime);

  // Time a kernel that reads a[i + stride].
  cudaEventRecord(start, 0);
  accessElement2<<< blk_in_grid, thr_per_blk >>>( d_A, stride );
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&elapsedTime, start, stop);
  printf("Elapsed time2 : %f ms\n", elapsedTime);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  free( A );
  cudaFree( d_A );

  return 0;
}
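
For reference, I build it with nvcc (element_access.cu is just what I happened to name the source file):

nvcc -O2 -o element_access element_access.cu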

A typical output looks like this:

$ ./element_access 100 1
N is 100
stride is 1
Dummy elapsed time : 0.016320 ms
Elapsed time1 : 0.007552 ms
Elapsed time2 : 0.007264 ms

There are two questions here:
1- The dummy time is much larger than the other two, so I cannot use "time1 - dummy" to find the actual time of accessElement1().
It seems that only the first launch is expensive and subsequent launches take much less time. Is something being cached here? (See the warm-up sketch after this list.)

2- Accessing A[i] and A[i+1] takes the same time. That suggests A is already cached when I launch accessElement1().
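
For question 1, what I have in mind (an untested sketch, reusing the events and launch configuration from the code above) is to do one untimed warm-up launch before the timed region, so that any one-time initialization cost is paid before the measurement starts:

// Sketch only: launch once without timing so one-time setup costs
// are not included in the measured launch overhead.
dummyKernel<<< blk_in_grid, thr_per_blk >>>();
cudaDeviceSynchronize();                 // wait for the warm-up launch to finish

// The timed launch should now contain only the steady-state launch overhead.
cudaEventRecord(start, 0);
dummyKernel<<< blk_in_grid, thr_per_blk >>>();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Dummy elapsed time after warm-up : %f ms\n", elapsedTime);

Would that be the right way to separate the one-time cost from the per-launch cost?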

Does anyone have an idea how to get a clear explanation for this?
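
One thing I have also been considering (not tested yet) is timing the accesses inside the kernel with clock64() instead of with host-side events, by chasing a chain of dependent loads and dividing the total cycle count by the number of loads. The kernel name chaseKernel and the parameters iterations, cycles and sink below are just names I made up for this sketch:

// Sketch only (untested): walk a pointer chain through the array so every
// load depends on the previous one, and time the whole chain with clock64().
// The host would initialize a[i] = (i + stride) % N so the chain visits the
// array with the chosen stride, then launch this kernel with <<< 1, 1 >>>.
__global__ void chaseKernel( int *a, int iterations, long long *cycles, int *sink )
{
  int j = 0;
  long long t0 = clock64();
  for ( int it = 0; it < iterations; it++ ) {
    j = a[ j ];                  // dependent load: cannot overlap with the next one
  }
  long long t1 = clock64();
  *sink   = j;                   // keep the loop from being optimized away
  *cycles = t1 - t0;             // total cycles; divide by iterations on the host
}

Dividing *cycles by iterations should give an estimate of the per-access latency in clock cycles, and running the chain once untimed first (or launching twice) could separate cold-cache from warm-cache numbers. Does that sound like a reasonable approach?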