cudaMemcpy leaks on TK1

Error323 · January 28, 2016, 2:02pm

Hi Everyone,

I’ll cut straight to it. I wrote the most basic cuda program that shows a memory leak on the Jetson TK1. It doesn’t show memory leaks on my desktop machines with regular NVIDIA gpus.

https://github.com/Error323/cudamemleak

The program leaks exactly 4 bytes per operation according to linux itself. See the program and it’s README.md for my methodology. I ran this on 3 different Jetson TK1’s and they all experience the leak. All TK1’s ran the same kernel (3.10.40-grinch-21.3.4).

Could those with a TK1 please verify and maybe explain what’s going on or what I’m doing wrong?

Kind regards,

Error323

Error323 · February 1, 2016, 12:33pm

The code I’m running is performing cudaMemcpy’s in a loop. And I’m checking the memory usage by obtaining its $PID:

sudo echo 0 $(awk '/Private/ {print "+", $2}' /proc/$PID/smaps) | bc

This all the code, compile with nvcc:

#include <iostream>
#include <vector>

#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>

#include <cuda_runtime_api.h>

#define cudaSafeCall(expr) gpu::___cudaSafeCall(expr, __FILE__, __LINE__, __func__)

namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func = "");

static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
    if (cudaSuccess != err)
        gpu::error(cudaGetErrorString(err), file, line, func);
}

void error(const char *error_string, const char *file, const int line, const char *func)
{
  std::cerr << "\n\nCudaError: " << error_string << std::endl;
  std::cerr << "  File:  " << file << std::endl;
  std::cerr << "  Line:  " << line << std::endl;
  std::cerr << "  Func:  " << func << std::endl;
  exit(1);
}

int memOp(int *data, int n)
{
  int *d_data;

  size_t data_size = n * sizeof(int);
  cudaSafeCall(cudaMalloc((void**)&d_data, data_size));

  cudaSafeCall(cudaMemcpy(d_data, data, data_size, cudaMemcpyHostToDevice));
  cudaSafeCall(cudaDeviceSynchronize());

  for (int i = 0; i < 1000000; i++)
  {
    cudaSafeCall(cudaMemcpy(data, d_data, data_size, cudaMemcpyDeviceToHost));
    cudaSafeCall(cudaDeviceSynchronize());
  }

  cudaSafeCall(cudaFree(d_data));
  cudaSafeCall(cudaDeviceSynchronize());

  return 0;
}
}

using namespace std;

int main(int argc, char *argv[])
{
  std::cout << "pid: " << getpid() << std::endl << std::endl;
  vector<int> data(512,0);
  gpu::memOp(data.data(), data.size());
  return 0;
}

kayccc · February 2, 2016, 6:55am

Hi Error323,

Thanks for reporting the issue, we are currently investigating the case and we’ll let you know when we have an update.

Error323 · February 17, 2016, 3:51pm

Hi again,

After doing some more analysis, it appears that the cudaMemcpy leak does stop at some point. The following simple example which was mostly taken from the programming guide also shows the leakage and it seems to be about 4 KiB.

Has there been any progress yet? Because for our project we’re leaking 200MiB in about 10 hrs which is very concerning. We are performing a camera demosaic and distortion correction with remap functions using textures and surfaces. I hope we are messing up somewhere…

transform.cu

#include <math.h>
#include <cuda_runtime_api.h>
#include <curand_kernel.h>

#define BLOCK 32

static texture<float, cudaTextureType2D, cudaReadModeElementType> gTex;

/**
 * Rotates an image
 */
__global__ void transform(float *dst, int w, int h, float a)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  if (x >= w || y >= h)
    return;

  float u = x / (float)w;
  float v = y / (float)h;

  u -= 0.5f;
  v -= 0.5f;
  float tu = u * cosf(a) - v * sinf(a) + 0.5f;
  float tv = v * cosf(a) + u * sinf(a) + 0.5f;

  dst[y*w+x] = tex2D(gTex, tu, tv);
}

void process(const float *src, float *dst, int w, int h, float a)
{
  cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<float>();
  cudaArray *cu_array;
  cudaMallocArray(&cu_array, &chan_desc, w, h);
  cudaMemcpyToArray(cu_array, 0, 0, src, w*h*sizeof(float), cudaMemcpyHostToDevice);

  gTex.addressMode[0] = cudaAddressModeBorder; 
  gTex.addressMode[1] = cudaAddressModeBorder;
  gTex.filterMode = cudaFilterModeLinear;
  gTex.normalized = true;

  cudaBindTextureToArray(gTex, cu_array, chan_desc);
  float *output;
  cudaMalloc(&output, w*h*sizeof(float));
  dim3 dimBlock(BLOCK, BLOCK);
  dim3 dimGrid((w + dimBlock.x - 1) / dimBlock.x,
               (h + dimBlock.y - 1) / dimBlock.y);

  transform<<<dimGrid, dimBlock>>>(output, w, h, a);
  cudaMemcpy(dst, output, w*h*sizeof(float), cudaMemcpyDeviceToHost);

  cudaFreeArray(cu_array);
  cudaFree(output);
}

main.cpp

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>

#include "transform.h"

using namespace std;

int main(void)
{
  printf("pid: %i\n", getpid());

  int w = 64;
  int h = 64;
  float img[w*h];
  float res[w*h];

  for (int i = 0; i < w*h; i++)
    img[i] = 1.0f;

  int N = 10000000;
  for (int i = 0; i < N; i++)
    process(img, res, w, h, i);

  for (int i = 0; i < h; i++)
  {
    for (int j = 0; j < w; j++)
      printf("%i ", int(res[i*w+j]));
    printf("\n");
  }
  return 0;
}

kayccc · February 24, 2016, 9:04am

Hi Error323,

Could you help to confirm the issue is still there after upgrade the SDK to R21.4?

Thanks

Topic		Replies	Views
FAO: Nvidia Engineers:- Memory Leak in cudaMemcpyAsync Only occurs on Host To Device memory transfer CUDA Programming and Performance	4	5866	August 18, 2010
Got out of memory from cudaMemcpy CUDA Programming and Performance	13	3880	January 28, 2022
Zero Copy vs. CudaMemcpy on Jetson TK1 Jetson TK1	4	1512	May 18, 2016
Huge memory leak CUDA Programming and Performance	16	5584	July 27, 2016
cudaMemcpy error, all data not being transferred. but cudaMemcpy returns cudaSuccess CUDA Programming and Performance	5	15193	August 7, 2019
Why the access speed of memory allocated by cudaMallocHost is so slow? Jetson TX2 cuda	8	693	October 18, 2021
cudaMemcpy problem problem with using cudaMemcpy CUDA Programming and Performance	12	5888	January 5, 2010
Weirdness at the 2 gigabyte boundary Jetson TX1	4	874	July 5, 2016
Kernel lunch overhead increases significantly (10x) when using unified memory on TK1 and TX1 Jetson TK1	5	3245	August 31, 2018
Why it is so slow to use cudamemcpy（cudaMemcpyHostToHost）on tx2 Jetson TX2 cuda	5	1166	October 18, 2021

cudaMemcpy leaks on TK1

Related topics