Question about GPU L2 cache memory access.

// Element-wise scale: the thread with global index idx writes y[idx] = a * x[idx]
// for idx < n. Despite the name, the previous contents of y are not read.
__global__ void saxpy(int n, float a, float *x, float *y)
{
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  // Threads past the end of the arrays have nothing to do.
  if (idx >= n) return;
  y[idx] = x[idx] * a;
}

The kernel and its Nsight Compute (ncu) result are shown above; the data size is 2048 * 2048 floats = 16 MB.
GPU: RTX 4070
We have some questions:

  1. Why does device memory show 16 MB loaded, but only a few bytes stored? Why are the load and store sizes not equal?
  2. Why is the L1 cache hit rate 0% and the L2 cache hit rate 50%?
    Thanks for the answer.

Hi, @Echo1109

Welcome to the Nsight Compute Forum.
Can you please provide the full source code so that we can compile it and generate a report? Then we can check in detail. Thanks!

Hi @veraj

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#include <stdlib.h>

// Element-wise scale kernel: one thread per element, y[gid] = a * x[gid] for
// gid < n. Note that y is only written; its old value is never loaded.
__global__ void saxpy(int n, float a, float *x, float *y)
{
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;
  // Guard the tail: the grid may contain more threads than elements.
  if (gid >= n) return;
  y[gid] = x[gid] * a;
}

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Host driver: fills two host arrays, copies them to the device, runs three
// warm-up launches of saxpy, times one further launch with CUDA events, copies
// the result back, and prints the elapsed time and effective bandwidth.
// Returns 0 on success; exits with a failure status on any allocation or CUDA error.
int main(void)
{
  const int N = 2048 * 2048;
  const size_t bytes = (size_t)N * sizeof(float);
  const int threads = 1024;
  const int blocks = (N + threads - 1) / threads;  // ceil-div so the tail is covered

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
    free(x);
    free(y);
    return EXIT_FAILURE;
  }

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  float *d_x = NULL;
  float *d_y = NULL;
  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  cudaEvent_t start, stop;
  checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
  checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_x)");
  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_y)");

  // Warm up so the timed launch does not include one-time start-up cost.
  for (int i = 0; i < 3; i++)
    saxpy<<<blocks, threads>>>(N, 2.0f, d_x, d_y);
  checkCuda(cudaGetLastError(), "saxpy warm-up launch");

  checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
  saxpy<<<blocks, threads>>>(N, 2.0f, d_x, d_y);
  checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
  // Launch errors are queried only after the stop event is enqueued so the
  // check does not sit between the two timing events.
  checkCuda(cudaGetLastError(), "saxpy timed launch");

  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(y)");

  checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

  float milliseconds = 0;
  checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

  // The kernel reads x once and writes y once: 2 * N * sizeof(float) bytes.
  // Computed in double so that a much larger N cannot overflow int arithmetic.
  const double movedBytes = 2.0 * (double)N * (double)sizeof(float);

  printf("N %d time : %f ms\n", N, milliseconds);
  printf("Effective Bandwidth (GB/s): %f\n", movedBytes / milliseconds / 1e6);

  checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
  checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
  checkCuda(cudaFree(d_x), "cudaFree(d_x)");
  checkCuda(cudaFree(d_y), "cudaFree(d_y)");
  free(x);
  free(y);
  return 0;
}

Thanks!

In the NVIDIA GPU architecture, the L2 cache is the point of coherence for GPU device memory. The RTX 4070 has > 30 MB of L2 cache, which is larger than the output buffer. The written data has not been evicted from the L2 cache; therefore, the device memory write size is only 2.43 KB instead of the expected ~16.78 MB.

If you change N to be 100x larger then the write value to Device Memory should be within a few percent of the expected value.

Thank you for your answer!
I understand.