Dram_write_bytes result on P100

Hi, I used nvprof to profile a simple vecadd example (n=1024) on a P100, but I observed that dram_write_bytes is only 256 (rather than the 1024*4 = 4096 bytes I expected). Can someone explain why this number is so small? What other metrics do I need to add to account for global memory writes? Thanks. The flop_count_sp number is correct (1024).

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Element-wise vector addition: c[id] = a[id] + b[id] for every id in [0, n).
//
// Each thread handles at most one element, selected by its flat 1D global
// index (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is >= n
// do nothing, so the launch may contain more threads than elements; it must
// contain at least n threads for every element to be written.
//
// a, b : input arrays of at least n floats, read by the kernel
// c    : output array of at least n floats, written by the kernel
// n    : number of elements to process
__global__ void vecAdd(float* a, float* b, float* c, int n){
    int id = blockIdx.x*blockDim.x + threadIdx.x;
    // Bounds guard: the grid may be larger than n.
    if(id < n) c[id] = a[id] + b[id];
}

// Aborts the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t err, const char* what){
    if(err != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two n-element input vectors, adds them on the device
// with vecAdd, copies the result back and prints the mean difference between
// the device result and the host inputs (h_c[i] - h_a[i] - h_b[i]).
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(int argc, char* argv[]){
    int n = 1024;
    float *h_a, *d_a;
    float *h_b, *d_b;
    float *h_c, *d_c;
    size_t bytes = n*sizeof(float);

    // Host buffers.
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);
    if(h_a == NULL || h_b == NULL || h_c == NULL){
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Device buffers.
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

    // Initialise the inputs on the host.
    int i;
    for(i = 0; i < n; i++){
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i+1);
    }

    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy h_a -> d_a");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy h_b -> d_b");

    // One thread per element; ceil-div so every element is covered even when
    // n is not a multiple of the block size (n = 1024 gives a single block).
    int threadsPerBlock = 1024;
    int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    vecAdd <<<blocks, threadsPerBlock>>> (d_a, d_b, d_c, n);
    // A launch does not return a status; query it explicitly.
    checkCuda(cudaGetLastError(), "vecAdd launch");

    // Blocking copy: also waits for the kernel and reports execution errors.
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy d_c -> h_c");

    float sum = 0;
    for(i = 0; i < n; i++)
        sum += h_c[i] - h_a[i] - h_b[i];
    printf("final diff: %f\n", sum/n);

    // Release device and host memory.
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}

Is this related to how nvprof samples the metric? On one run I got 384 bytes, and sometimes I even got 0 bytes.