Surface Memory Slower Than Normal Access

According to the Performance Guidelines (5.3.2. Device Memory Accesses) of the CUDA Programming Guide, “reading device memory through texture or surface fetching present some benefits that can make it an advantageous alternative to reading device memory from global or constant memory”. I’ve benchmarked surface fetching against plain global memory access on NVIDIA Jetson Orin modules (-arch=sm_87) using 1D/2D surface objects and various grid/block sizes, data types, and surface sizes. However, plain global access was consistently around 25% faster. Here are the sample programs:
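
For reference, each version was timed with CUDA events around the launch loop; the sketch below is illustrative of how the numbers were gathered (it slots into main() in place of the bare launch loop, shown here with the surface version’s arguments):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

kernel<<<grid, blocks>>>(surf, W, H);   // warm-up launch, not timed

cudaEventRecord(start);
for (int i = 0; i < 12; ++i) {
    kernel<<<grid, blocks>>>(surf, W, H);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("average per launch: %.3f ms\n", ms / 12.0f);

cudaEventDestroy(start);
cudaEventDestroy(stop);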

Surface Fetching

#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

__global__ void kernel(cudaSurfaceObject_t surf, size_t w, size_t h) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        // The x coordinate of surf2Dread/surf2Dwrite is a byte offset.
        float f = surf2Dread<float>(surf, x * sizeof(float), y);
        surf2Dwrite(f + x + y, surf, x * sizeof(float), y);
    }
}

int main() {
    const size_t W = 16384;
    const size_t H = 2024;
    const size_t S = sizeof(float) * W * H;
    const size_t P = sizeof(float) * W;  // pitch of one row in bytes
    dim3 blocks(32, 8);  // 256 threads per block (a block may not exceed 1024 threads in total)
    dim3 grid((W + blocks.x - 1) / blocks.x, (H + blocks.y - 1) / blocks.y);

    float *h_data = (float *)malloc(S);

    for (size_t i = 0; i < W * H; ++i) h_data[i] = (float)i;

    // One 32-bit float channel; cudaArraySurfaceLoadStore is required
    // so the array can be bound to a surface object.
    cudaChannelFormatDesc c_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray_t d_arr;
    cudaMallocArray(&d_arr, &c_desc, W, H, cudaArraySurfaceLoadStore);
    cudaMemcpy2DToArray(d_arr, 0, 0, h_data, P, P, H, cudaMemcpyHostToDevice);

    struct cudaResourceDesc r_desc;
    memset(&r_desc, 0, sizeof(r_desc));
    r_desc.resType = cudaResourceTypeArray;
    r_desc.res.array.array = d_arr;

    cudaSurfaceObject_t surf = 0;
    cudaCreateSurfaceObject(&surf, &r_desc);

    for (int i = 0; i < 12; ++i) {
        kernel<<<grid, blocks>>>(surf, W, H);
    }

    cudaMemcpy2DFromArray(h_data, P, d_arr, 0, 0, P, H, cudaMemcpyDeviceToHost);

    cudaDestroySurfaceObject(surf);
    cudaFreeArray(d_arr);
    free(h_data);

    return 0;
}
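
As an aside, surf2Dread and surf2Dwrite also accept an optional boundary-mode argument, so the explicit bounds check can be dropped. A sketch (kernel_bm is an illustrative name; cudaBoundaryModeZero turns out-of-range reads into zeros and out-of-range writes into no-ops):

__global__ void kernel_bm(cudaSurfaceObject_t surf) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    // No if (x < w && y < h) guard: the boundary mode handles
    // out-of-range coordinates.
    float f = surf2Dread<float>(surf, x * sizeof(float), y, cudaBoundaryModeZero);
    surf2Dwrite(f + x + y, surf, x * sizeof(float), y, cudaBoundaryModeZero);
}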

Normal Access

#include <cstdlib>
#include <cuda_runtime.h>

__global__ void kernel(float* d_data, size_t w, size_t h) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        d_data[y * w + x] += (float)(x + y);
    }
}

int main() {
    const size_t W = 16384;
    const size_t H = 2024;
    const size_t S = sizeof(float) * W * H;
    dim3 blocks(32, 8);  // 256 threads per block, matching the surface version
    dim3 grid((W + blocks.x - 1) / blocks.x, (H + blocks.y - 1) / blocks.y);

    float *h_data = (float *)malloc(S);

    for (size_t i = 0; i < W * H; ++i) h_data[i] = (float)i;

    float *d_data = NULL;
    cudaMalloc((void **)&d_data, S);
    cudaMemcpy(d_data, h_data, S, cudaMemcpyHostToDevice);

    for (int i = 0; i < 12; ++i) {
        kernel<<<grid, blocks>>>(d_data, W, H);
    }

    cudaMemcpy(h_data, d_data, S, cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    free(h_data);

    return 0;
}
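
I’m also aware that plain global loads can be routed through the read-only data cache via const __restrict__ pointers or __ldg(). A sketch of that variant (kernel_ro and the separate in/out buffers are illustrative, since the original kernel updates in place and __restrict__ requires non-aliasing pointers):

__global__ void kernel_ro(const float* __restrict__ in,
                          float* __restrict__ out, size_t w, size_t h) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        // __ldg() issues the load through the read-only (texture) data path;
        // with const __restrict__ the compiler can often do this by itself.
        out[y * w + x] = __ldg(&in[y * w + x]) + (float)(x + y);
    }
}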

I’ve also tried compiling with various optimisation flags (e.g. -O3), but the result is the same. Could anyone explain these results and, ideally, how to properly utilize the texture cache?
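
For completeness, the texture-fetch path I have in mind binds the same cudaArray to a cudaTextureObject_t and reads with tex2D. A sketch (kernel_tex and t_desc are illustrative names, r_desc is the resource descriptor from the surface program, and since textures are read-only the write still goes through the surface object):

__global__ void kernel_tex(cudaTextureObject_t tex, cudaSurfaceObject_t surf,
                           size_t w, size_t h) {
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < w && y < h) {
        // tex2D takes float coordinates; the +0.5f addresses the texel
        // center with unnormalized coordinates and point filtering.
        float f = tex2D<float>(tex, x + 0.5f, y + 0.5f);
        surf2Dwrite(f + x + y, surf, x * sizeof(float), y);
    }
}

// Host side, next to the surface-object creation:
cudaTextureDesc t_desc;
memset(&t_desc, 0, sizeof(t_desc));
t_desc.addressMode[0]   = cudaAddressModeClamp;
t_desc.addressMode[1]   = cudaAddressModeClamp;
t_desc.filterMode       = cudaFilterModePoint;
t_desc.readMode         = cudaReadModeElementType;
t_desc.normalizedCoords = 0;

cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &r_desc, &t_desc, NULL);

Note that the texture cache is not coherent with surface writes within a kernel launch, so reading and writing the same array this way is only safe across separate launches.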