Some confusion about mem_texture_op_tex metrics

i test this metrics lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex (# of bytes requested for TEX instructions)through this kernel:

__global__ void LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0(float *output, cudaTextureObject_t tex_obj,
                                                                   int width, int height, float theta) {
    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / static_cast<float>(width);
    float v = y / static_cast<float>(height);

    // Transform coordinates
    // u -= 0.5f;
    // v -= 0.5f;
    // float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    // float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(tex_obj, u, v);
}

void LtsTBytesEquivL1sectormissPipeTexMemTexture() {
    const int test_num = 1;
    const int loop_num = 1;
    int grid_num[test_num]{32};
    int thread_num[test_num]{32};
    float *h_src[test_num] = {nullptr};
    float *d_res[test_num] = {nullptr};

    float angle = 0.5;
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t cu_array[test_num];

    std::srand(time(nullptr));  // use current time as seed for random generator
    CUDA_CHECK(cudaSetDevice(0));
    for (int i = 0; i < loop_num; i++) {
        dim3 block(thread_num[i]);
        dim3 grid(grid_num[i]);

        // alloc cpu memory
        h_src[i] = new float[thread_num[i] * grid_num[i]];
        for (int j = 0; j < grid_num[i] * thread_num[i]; j++) {
            h_src[i][j] = (std::rand() % 255);
        }

        // Allocate CUDA array in device memory
        CUDA_CHECK(cudaMallocArray(&cu_array[i], &channel_desc, thread_num[i], grid_num[i]));

        // Set pitch of the source (the width in memory in bytes of the 2D array pointed
        // to by src, including padding), we dont have any padding
        const size_t spitch = thread_num[i] * sizeof(float);

        // Copy data located at address h_data in host memory to device memory
        CUDA_CHECK(cudaMemcpy2DToArray(cu_array[i], 0, 0, h_src[i], spitch, thread_num[i] * sizeof(float), grid_num[i],
                                       cudaMemcpyHostToDevice));

        // Specify texture
        struct cudaResourceDesc res_desc;
        memset(&res_desc, 0, sizeof(res_desc));
        res_desc.resType = cudaResourceTypeArray;
        res_desc.res.array.array = cu_array[i];

        // Specify texture object parameters
        struct cudaTextureDesc tex_desc;
        memset(&tex_desc, 0, sizeof(tex_desc));
        tex_desc.addressMode[0] = cudaAddressModeWrap;
        tex_desc.addressMode[1] = cudaAddressModeWrap;
        tex_desc.filterMode = cudaFilterModeLinear;
        tex_desc.readMode = cudaReadModeElementType;
        tex_desc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t tex_obj = 0;
        cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);

        // Allocate result of transformation in device memory
        CUDA_CHECK(cudaMalloc(&d_res[i], grid_num[i] * thread_num[i] * sizeof(float)));

        // kernel
        switch (i) {
            case 0: {
                LtsTBytesEquivL1sectormissPipeTexMemTextureKernel0<<<grid, block>>>(d_res[i], tex_obj, thread_num[i],
                                                                                    grid_num[i], angle);
                break;
            }
            default:
                break;
        }

        CUDA_CHECK(cudaStreamSynchronize(0));
        CUDA_CHECK(cudaGetLastError());

        // Destroy texture object
        cudaDestroyTextureObject(tex_obj);
        cudaFreeArray(cu_array[i]);

        delete[] h_src[i];
        CUDA_FREE(d_res[i]);
    }
}

i lanuch my kernel with (32,32) threads, and the kernel just read them ,and write to global memory(without Transform coordinates), here is the results:

 lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.avg                   byte                            512
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.max                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.min                   byte                            nan
    lts__t_bytes_equiv_l1sectormiss_pipe_tex_mem_texture_op_tex.sum                  Kbyte                          16.38

i want to ask why it is 16384byte , i only malloc 32324=4096byte?