Confused about global memory write cycle

Hi, I’m very confused about CUDA global memory write cycles.
Kernel source 1 performs only global memory writes; measured with the clock() function, they take 580 cycles.
Kernel source 2 is a hybrid that also does some computation plus LDG, LDS, and STS operations, yet its global memory writes take only 122 cycles when measured with clock().
Both kernels are launched with <<<1,1>>>.

source 1

/**
 * Timing micro-benchmark: stores ten floats to global memory and records the
 * clock() delta measured around the store loop.
 *
 * @param placeholder   Not read by this kernel; kept so the signature is unchanged.
 * @param placeholder1  Not read by this kernel; kept so the signature is unchanged.
 * @param compute       Output buffer, accessed as float*. Each thread writes
 *                      elements base + k*290 and base + k*290 + 84100 for
 *                      k in [0, 5), where
 *                      base = blockIdx.y*168200 + blockIdx.x*1450 + threadIdx.x.
 * @param duration      duration[3] receives the elapsed clock() ticks of the
 *                      store loop. The slot is not indexed per thread, so every
 *                      thread that runs the kernel writes the same element.
 *
 * NOTE(review): no fence precedes the second clock() read, so the measured
 * interval presumably covers issuing the stores rather than their completion
 * in memory -- confirm before comparing against other kernels.
 */
extern "C" __global__ void default_function_kernel315701(void* __restrict__ placeholder, void* __restrict__ placeholder1, void* __restrict__ compute, unsigned int *duration) {
  // Only the first two slots are given values; the remaining eight are
  // zero-initialised by the aggregate initializer. Literals carry the `f`
  // suffix so no double constants appear in a float kernel.
  float compute_local[10] = {3.1f, 3.9f};
  clock_t startTime, endTime;

  startTime = clock();
  float* const out = (float*)compute;
  // Loop-invariant part of the output index, computed once inside the timed region.
  const int base = (((int)blockIdx.y) * 168200) + (((int)blockIdx.x) * 1450) + ((int)threadIdx.x);
  for (int yy_inner_inner_inner = 0; yy_inner_inner_inner < 5; ++yy_inner_inner_inner) {
    const int offset = base + (yy_inner_inner_inner * 290);
    out[offset] = compute_local[yy_inner_inner_inner];
    out[offset + 84100] = compute_local[yy_inner_inner_inner + 5];
  }
  endTime = clock();
  // Explicit narrowing: clock_t is wider than the unsigned int result slot.
  duration[3] = (unsigned int)(endTime - startTime);
}

source 2

/**
 * Four-phase kernel that times each phase with clock() and stores the deltas
 * in duration[0..3]:
 *
 *   duration[0]  zeroing the accumulators and staging data from `placeholder`
 *                and `placeholder1` into shared memory
 *   duration[1]  the __syncthreads() barrier
 *   duration[2]  the 5x5 multiply-accumulate loop over the shared buffers
 *   duration[3]  writing the ten accumulators to `compute`
 *
 * @param placeholder   Input buffer, read as float*.
 * @param placeholder1  Second input buffer, read as float*; 50 elements per
 *                      blockIdx.y are staged into shared memory.
 * @param compute       Output buffer, written as float*.
 * @param duration      Receives the four timing values. The slots are not
 *                      indexed per thread, so every thread writes the same ones.
 *
 * NOTE(review): indices advance in steps of 290 per thread, so the kernel
 * presumably expects blockDim.x == 290; with fewer threads parts of the shared
 * buffers read in phase 2 are never written -- confirm against the launch.
 */
extern "C" __global__ void default_function_kernel315701(void* __restrict__ placeholder, void* __restrict__ placeholder1, void* __restrict__ compute, unsigned int *duration) {
  float compute_local[10] = {3.1, 3.9};
  __shared__ float pad_temp_shared[5292];
  __shared__ float placeholder_shared[50];
  clock_t startTime, endTime;
  const int tid = (int)threadIdx.x;
  const int bx = (int)blockIdx.x;
  const int by = (int)blockIdx.y;
#if 1
  // ---- Phase 0: clear accumulators, stage inputs into shared memory ----
  startTime = clock();
  for (int slot = 0; slot < 10; ++slot) {
    compute_local[slot] = 0.000000e+00f;
  }
  const float* const input = (const float*)placeholder;
  const float* const weights = (const float*)placeholder1;
  for (int chunk = 0; chunk < 19; ++chunk) {
    const int flat = (chunk * 290) + tid;
    if (flat >= 5292) {
      continue;  // past the end of pad_temp_shared
    }
    const int plane = flat / 2646;
    const int row = (flat % 2646) / 294;
    const int col = flat % 294;
    const int shiftedRow = (bx * 5) + row;
    // Elements whose row or column falls outside [3, 291) are stored as zero
    // instead of being loaded from the input buffer.
    float staged = 0.000000e+00f;
    if ((3 <= shiftedRow) && (shiftedRow < 291) && (3 <= col) && (col < 291)) {
      staged = input[(by * 165888) + (plane * 82944) + (bx * 1440) + (row * 288) + col - 867];
    }
    pad_temp_shared[flat] = staged;
  }
  if (tid < 50) {
    placeholder_shared[tid] = weights[(by * 50) + tid];
  }
  endTime = clock();
  duration[0] = (unsigned int)(endTime - startTime);

  // ---- Phase 1: block-wide barrier ----
  startTime = clock();
  __syncthreads();
  endTime = clock();
  duration[1] = (unsigned int)(endTime - startTime);

  // ---- Phase 2: 5x5 multiply-accumulate from shared memory ----
  startTime = clock();
  for (int ry = 0; ry < 5; ++ry) {
    for (int rx = 0; rx < 5; ++rx) {
      const int tap = (ry * 5) + rx;
      for (int yy = 0; yy < 5; ++yy) {
        const int src = ((yy + ry) * 294) + tid + rx;
        compute_local[yy] = compute_local[yy] + (pad_temp_shared[src] * placeholder_shared[tap]);
        compute_local[yy + 5] = compute_local[yy + 5] + (pad_temp_shared[src + 2646] * placeholder_shared[tap + 25]);
      }
    }
  }
  endTime = clock();
  duration[2] = (unsigned int)(endTime - startTime);
#endif
  // ---- Phase 3: write accumulators to global memory ----
  startTime = clock();
  float* const out = (float*)compute;
  for (int yy = 0; yy < 5; ++yy) {
    const int dst = (by * 168200) + (bx * 1450) + (yy * 290) + tid;
    out[dst] = compute_local[yy];
    out[dst + 84100] = compute_local[yy + 5];
  }
  endTime = clock();
  duration[3] = (unsigned int)(endTime - startTime);
}