Why does dynamic global memory allocation gradually slow down?

Dynamic memory allocation inside a kernel gradually slows down.
I am using CUDA 8 on a Tesla K80.

Here is my code.

#include <stdio.h>

// Each thread allocates a 32-byte buffer with in-kernel `new` and stores the
// resulting pointer in its own slot of `a`.
//
// Launch layout: indexes with the x dimension only and performs no bounds
// check, so `a` must provide at least gridDim.x * blockDim.x slots.
__global__
void mallocInKernel(char **a) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  char *buffer = new char[32];
  a[idx] = buffer;
  // The buffer is never released here; uncomment the next line to free it
  // before the kernel returns.
  //delete[] a[idx];
}

// Error check for use inside main() only: on failure it prints the location
// and CUDA error string to stderr and makes main() return 1.
#define CUDA_CHECK(call)                                                 \
  do {                                                                   \
    cudaError_t err_ = (call);                                           \
    if (err_ != cudaSuccess) {                                           \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(err_));                                 \
      return 1;                                                          \
    }                                                                    \
  } while (0)

// Launches mallocInKernel 100 times and prints the elapsed time of each
// launch, measured with CUDA events on the default stream.
//
// Returns 0 on success, 1 if any CUDA runtime call or kernel launch fails.
int main() {
  const dim3 grid(32, 1);
  const dim3 block(1024, 1);

  // One pointer slot per launched thread; the kernel has no bounds check, so
  // this size must match the launch configuration exactly.
  const size_t slots_bytes = sizeof(char*) * grid.x * block.x;

  // Size of the device heap that backs in-kernel new/delete (1 GiB). Set
  // before the first launch of a kernel that allocates.
  const size_t heap_size = 1024ull * 1024 * 1024;
  CUDA_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

  char **a = NULL;
  CUDA_CHECK(cudaMalloc(&a, slots_bytes));
  CUDA_CHECK(cudaMemset(a, 0, slots_bytes));

  // Create the timing events once and reuse them for every iteration instead
  // of creating and destroying a pair per launch.
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  for (int i = 0; i < 100; ++i) {
    CUDA_CHECK(cudaEventRecord(start, 0));
    mallocInKernel<<<grid, block>>>(a);
    CUDA_CHECK(cudaEventRecord(stop, 0));
    // Launch-configuration errors are not reported by the launch itself.
    CUDA_CHECK(cudaGetLastError());
    // Waits for the kernel to finish; also surfaces errors raised while the
    // kernel was executing.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float elapsed_time_ms = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start, stop));
    printf("time: %8.2f ms\n", elapsed_time_ms);
  }

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));

  // Frees only the pointer table. The buffers the kernel allocated with `new`
  // are not freed here, and every launch overwrote the pointers stored by the
  // previous one.
  CUDA_CHECK(cudaFree(a));

  return 0;
}

When I compiled and ran the above code, I got the following output.

time:    16.56 ms
time:    42.68 ms
time:    97.76 ms
time:   116.99 ms
time:   102.60 ms
time:   121.56 ms
.
.
.
time:   771.40 ms

If I uncomment the `delete[] a[idx];` line in the kernel above, the elapsed time of the kernel stabilizes.
Please let me know if you have any suggestions or comments. I appreciate your help.