Why is gld_efficiency only 50% when the L1 cache is enabled on Pascal?

The global memory reads in this example are aligned and coalesced, yet gld_efficiency drops to 50% when I compile with the nvcc option -Xptxas -dlcm=ca to enable the L1/texture cache.
The code is copied from the book Professional CUDA C Programming.

#include "../common/common.h"
#include <cuda_runtime.h>
#include <stdio.h>
#include <math.h>

/*
 * This example demonstrates the impact of misaligned reads on performance by
 * forcing misaligned reads to occur on a float*.
 */

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
                    gpuRef[i]);
            break;
        }
    }

    if (!match)  printf("Arrays do not match.\n\n");
}

void initialData(float *ip,  int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 100.0f;
    }

    return;
}

void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset)
{
    for (int idx = offset, k = 0; idx < n; idx++, k++)
    {
        C[k] = A[idx] + B[idx];
    }
}

__global__ void warmup(float *A, float *B, float *C, const int n, int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k] + B[k];
}

__global__ void readOffset(float *A, float *B, float *C, const int n,
                           int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k] + B[k];
}

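// Read-only variant: each thread loads a single element A[k] and stores it
// directly to C[i], so there is exactly one global load per thread.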
__global__ void readTest(float *A, float *C, const int n, int offset)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int k = i + offset;

    if (k < n) C[i] = A[k];
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting reduction at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up array size
    int nElem = 1 << 20; // total number of elements
    printf(" with array size %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);

    // set up offset for summary
    int blocksize = 512;
    int offset = 0;

    if (argc > 1) offset    = atoi(argv[1]);

    if (argc > 2) blocksize = atoi(argv[2]);

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // allocate host memory
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    //  initialize host array
    initialData(h_A, nElem);
    memcpy(h_B, h_A, nBytes);

    //  summary at host side
    sumArraysOnHost(h_A, h_B, hostRef, nElem, offset);

    // allocate device memory
    float *d_A, *d_B, *d_C, *d_test;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));
    CHECK(cudaMalloc((float**)&d_test, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_A, nBytes, cudaMemcpyHostToDevice));

    //  kernel 1:
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup     <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
           block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    iStart = seconds();
    readOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("readOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
           block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    iStart = seconds();
    readTest<<<grid, block>>>(d_A, d_test, nElem, offset);
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("readTest <<<%4d, %4d>>> offset %4d elapsed %f sec\n", grid.x, block.x,
			block.x, offset, iElaps);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side and check device results
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkResult(hostRef, gpuRef, nElem - offset);

    // free host and device memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    CHECK(cudaFree(d_test));
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}

I compile the code with the following command:

nvcc -arch=sm_61 readSegment.cu -o readSegment

and profile it with different offset values:

nvprof --metrics gld_efficiency ./readSegment 0

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%

nvprof --metrics gld_efficiency ./readSegment 11

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      80.00%      80.00%      80.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      80.00%      80.00%      80.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      80.00%      80.00%      80.00%

nvprof --metrics gld_efficiency ./readSegment 128

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency     100.00%     100.00%     100.00%

Then I compile the code with the following command to enable the L1/texture cache:

nvcc -arch=sm_61 -Xptxas -dlcm=ca readSegment.cu -o enableL1ReadSegment

and profile it with the same offset values:

nvprof --metrics gld_efficiency ./enableL1ReadSegment 0

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%

nvprof --metrics gld_efficiency ./enableL1ReadSegment 11

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      40.00%      40.00%      40.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      40.00%      40.00%      40.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      40.00%      40.00%      40.00%

nvprof --metrics gld_efficiency ./enableL1ReadSegment 128

Metric result:
Invocations                               Metric Name                        Metric Description         Min         Max         Avg
Device "TITAN Xp (0)"
    Kernel: readOffset(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%
    Kernel: readTest(float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%
    Kernel: warmup(float*, float*, float*, int, int)
          1                            gld_efficiency             Global Memory Load Efficiency      50.00%      50.00%      50.00%

Why does enabling the L1 cache cut gld_efficiency to half of the value reported with the cache disabled?
I know an L1 cache line is 128 bytes, but that alone does not seem to explain the numbers above.
I hope to get a more thorough explanation.
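
Here is my reasoning so far, which is apparently missing something: with an aligned, fully coalesced access, each warp load should map onto a single L1 line.

requested per warp load    : 32 threads * 4 bytes = 128 bytes
L1/texture cache line size : 128 bytes

So I would expect one 128-byte line per warp load and therefore 100% efficiency, not 50%.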
My GPU is a TITAN Xp.
Thanks.