I have encountered a problem where CUDA per-thread local array memory is not released after kernel execution.
I am using CUDA 9.1 (compute_61,sm_61) on Windows 10.
Below is the code snippet to query device memory:
// Prints the current device memory usage in MiB, prefixed with the caller's
// label `str` (e.g. "before:", "after malloc:").
// On cudaMemGetInfo failure the CUDA error is reported and the function
// returns without printing a (meaningless) usage figure.
void query_device_memory(const char * str)
{
    size_t free_byte = 0;
    size_t total_byte = 0;
    auto cuda_status = cudaMemGetInfo(&free_byte, &total_byte);
    if (cudaSuccess != cuda_status)
    {
        printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status));
        // BUG FIX: previously fell through and printed uninitialized values.
        return;
    }
    double const used_mb = (double)(total_byte - free_byte) / 1024.0 / 1024.0;
    // BUG FIX: the label argument was previously ignored, so the three
    // samples in the output could not be told apart by their labels.
    std::cout << str << " GPU memory usage: used = " << used_mb << std::endl;
}
Below is the dummy kernel used to illustrate the problem:
// Dummy kernel used only to force a large per-thread local array:
// 16384 floats = 64 KB per thread, far beyond register capacity, so the
// compiler places the array in local memory (backed by device global memory),
// which is what the surrounding example is probing.
// NOTE(review): *dumm is read and written by every thread with no
// synchronization; the final value is racy by construction — acceptable for
// this memory-footprint demonstration, not for real computation.
__global__ void dummy_kernel(float * dumm)
{
    unsigned const tid = threadIdx.x;
    int const scale = *dumm;          // float -> int conversion, as in the demo

    float scratch[16384] = {};        // the 64 KB local array under test

    // Re-reads *dumm from global memory every iteration on purpose —
    // hoisting it would change the memory traffic being demonstrated.
    for (size_t iter = 0; iter < 16384; iter++)
    {
        scratch[(tid * scale) % 16384] += *dumm;
    }

    // Publish one element so the array (and the loop) cannot be optimized away.
    *dumm = scratch[scale % 16384];
}
// Demonstrates device memory usage growth across a kernel launch by sampling
// free device memory before allocation, after allocation, and after running
// a kernel whose 64 KB per-thread local array forces the driver to grow the
// context's local-memory backing store.
int main()
{
    query_device_memory("before:");

    float * d_dumm = nullptr;
    CHECK(cudaMalloc(&d_dumm, sizeof(float)));
    CHECK(cudaMemset(d_dumm, 0, sizeof(float)));
    query_device_memory("after malloc:");

    // 1000 blocks x 32 threads; each thread declares a 64 KB local array.
    dummy_kernel<<<1000, 32>>>(d_dumm);
    CHECK(cudaGetLastError());       // catches launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // catches asynchronous execution errors
    query_device_memory("after:");

    float answer = 0.f;
    CHECK(cudaMemcpy(&answer, d_dumm, sizeof(float), cudaMemcpyDeviceToHost));
    // BUG FIX: the result was copied back but never used.
    std::cout << "answer = " << answer << std::endl;

    // BUG FIX: explicit free of the device buffer (previously released only
    // implicitly by cudaDeviceReset).
    CHECK(cudaFree(d_dumm));
    // Tears down the CUDA context; resources held by the context (including
    // the driver's local-memory pool, presumably — confirm with a final
    // cudaMemGetInfo from a fresh context) are released here.
    CHECK(cudaDeviceReset());
    return 0;
}
The output of the program:
GPU memory usage: used = 2204.38
GPU memory usage: used = 2206.38
GPU memory usage: used = 4878.38