Why is the L2 cache hit rate for L1 store requests 100%?


I was testing a simple vector_add sample, and I don't understand why the L2 cache hit rate for L1 store requests is 100% while the total is 33.33%. The code is below:
// Reinterprets the storage that begins at `value` as a float4 and yields that
// float4 as an lvalue. The argument and the whole expansion are parenthesized
// so the macro composes safely inside larger expressions.
// NOTE(review): float4 accesses need a 16-byte-aligned address with four
// floats available behind it; the macro does not check this -- callers must.
#define FLOAT4(value) (reinterpret_cast<float4*>(&(value))[0])
/**
 * Element-wise vector addition kernel: c[i] = a[i] + b[i] for i in [0, N).
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is
 * >= N do nothing, so the launch may round the grid up past N.
 *
 * @param a  first input array, read at indices [0, N)
 * @param b  second input array, read at indices [0, N)
 * @param c  output array, written at indices [0, N)
 * @param N  number of elements to process
 */
__global__ void modify(float* a, float* b, float* c, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard: the last block may extend past the end of the data.
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}

// Host driver: builds two N-element input vectors, adds them on the device
// with the `modify` kernel, and prints the first (up to) 100 results.
//
// Every CUDA runtime call is passed through cudaCheck, the same helper the
// original used for cudaGetLastError().
// NOTE(review): cudaCheck is defined outside this view; it is assumed to
// accept any cudaError_t -- confirm against its definition.
int main()
{
    printf("_____%d\n", 0 | -1);

    const int    N     = 500;
    const size_t bytes = sizeof(float) * N;

    // Host inputs.
    float* h_a = ( float* )malloc(bytes);
    float* h_b = ( float* )malloc(bytes);
    if (h_a == nullptr || h_b == nullptr)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_a);  // free(nullptr) is a no-op
        free(h_b);
        return 1;
    }

    for (int i = 0; i < N; i++)
    {
        h_a[i] = i;
        h_b[i] = i + N;
    }

    // Device buffers and host-to-device uploads.
    float* d_a = nullptr;
    cudaCheck(cudaMalloc(( void** )&d_a, bytes));
    cudaCheck(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));

    float* d_b = nullptr;
    cudaCheck(cudaMalloc(( void** )&d_b, bytes));
    cudaCheck(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    float* res = nullptr;
    cudaCheck(cudaMalloc(( void** )&res, bytes));

    // Ceil-divide so the grid covers all N elements; the kernel's bounds
    // guard handles the overshoot in the last block.
    const int blocksize = 256;
    const int gridsize  = (N - 1 + blocksize) / blocksize;
    modify<<<gridsize, blocksize>>>(d_a, d_b, res, N);
    cudaCheck(cudaGetLastError());       // launch-configuration errors
    cudaCheck(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Print the leading results; clamp the count so a smaller N never reads
    // past the end of the device buffer.
    const int shown = (N < 100) ? N : 100;
    thrust::device_ptr<float> d_res(res);
    thrust::copy(d_res, d_res + shown, std::ostream_iterator<float>(std::cout, " "));
    std::cout << std::endl;

    cudaCheck(cudaFree(d_a));
    cudaCheck(cudaFree(d_b));
    cudaCheck(cudaFree(res));
    free(h_a);
    free(h_b);

    return 0;
}