Dynamic memory allocation in a kernel gradually slows down.
I am using CUDA 8 and Tesla K80.
Here is my code.
#include <stdio.h>
// Each thread allocates a 32-byte char buffer with in-kernel `new` and stores
// the resulting pointer in its own slot of `a`.
//
// Expects a 1-D launch. There is no bounds check, so `a` must provide at least
// gridDim.x * blockDim.x pointer slots. Whatever pointer was previously stored
// in the slot is overwritten without being freed.
__global__
void mallocInKernel(char **a) {
    // Flat global thread index for a 1-D grid of 1-D blocks.
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    char *buffer = new char[32];
    a[idx] = buffer;
    // Uncomment to release the buffer immediately after allocating it.
    //delete[] a[idx];
}
// Evaluates a CUDA runtime call inside main(); on failure it reports the call
// site and the runtime's error string on stderr and makes main() return 1.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_status_)); \
            return 1; \
        } \
    } while (0)

// Launches mallocInKernel 100 times and prints the elapsed time of each launch,
// measured with CUDA events on the default stream.
//
// Returns 0 on success, or 1 as soon as any CUDA runtime call or kernel launch
// reports an error (the error is printed to stderr).
int main() {
    dim3 grid(32, 1);
    dim3 block(1024, 1);
    // One pointer slot per launched thread.
    const size_t slot_bytes = sizeof(char*) * grid.x * block.x;

    // Raise the in-kernel malloc/new heap limit to 1 GiB before the first launch.
    size_t heap_size = 1024 * 1024 * 1024;
    CUDA_CHECK(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heap_size));

    char **a;
    CUDA_CHECK(cudaMalloc(&a, slot_bytes));
    CUDA_CHECK(cudaMemset(a, 0, slot_bytes));

    // The events are reusable, so create them once rather than per iteration.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    for (int i = 0; i < 100; ++i) {
        CUDA_CHECK(cudaEventRecord(start, 0));
        mallocInKernel<<<grid, block>>>(a);
        // A kernel launch returns no status; launch errors surface here.
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaEventRecord(stop, 0));
        // Waiting on `stop` also reports errors raised while the kernel ran.
        CUDA_CHECK(cudaEventSynchronize(stop));

        float elapsed_time_ms = 0.0f;
        CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start, stop));
        printf("time: %8.2f ms\n", elapsed_time_ms);
    }

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // NOTE: this frees only the pointer table. The buffers the kernel allocated
    // with `new` are never deleted by this program; per the CUDA programming
    // guide they can only be released from device code.
    CUDA_CHECK(cudaFree(a));
    return 0;
}
When I compiled and ran the above code, I got the following output.
time: 16.56 ms
time: 42.68 ms
time: 97.76 ms
time: 116.99 ms
time: 102.60 ms
time: 121.56 ms
.
.
.
time: 771.40 ms
If I uncomment the commented-out line (delete[] a[idx];) in the above code, the elapsed time of the kernel stabilizes.
Please let me know if you have any suggestion or comment. I appreciate your help.