cudaMemcpy leaks on TK1

Hi Everyone,

I’ll cut straight to it. I wrote the most basic cuda program that shows a memory leak on the Jetson TK1. It doesn’t show memory leaks on my desktop machines with regular NVIDIA gpus.

The program leaks exactly 4 bytes per operation according to Linux itself. See the program and the measurement command below for my methodology. I ran this on 3 different Jetson TK1’s and they all experience the leak. All TK1’s ran the same kernel (3.10.40-grinch-21.3.4).

Could those with a TK1 please verify and maybe explain what’s going on or what I’m doing wrong?

Kind regards,


The code I’m running is performing cudaMemcpy’s in a loop. And I’m checking the memory usage by obtaining its $PID:

echo 0 $(sudo awk '/Private/ {print "+", $2}' /proc/$PID/smaps) | bc

This is all the code; compile with nvcc:

#include <iostream>
#include <vector>

#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>

#include <cuda_runtime_api.h>

// Error-checking wrapper: evaluates a CUDA runtime call and, on failure,
// reports the error string together with the file/line/function call site.
#define cudaSafeCall(expr) gpu::___cudaSafeCall(expr, __FILE__, __LINE__, __func__)

namespace gpu
{
// Prints a CUDA error message with the source location that triggered it.
// Declared up front so ___cudaSafeCall can forward to it.
void error(const char *error_string, const char *file, const int line, const char *func = "");

// Checks the result of a CUDA runtime call; on failure forwards the
// human-readable error string and the call site to gpu::error().
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
  if (cudaSuccess != err)
    gpu::error(cudaGetErrorString(err), file, line, func);
}

void error(const char *error_string, const char *file, const int line, const char *func)
{
  std::cerr << "\n\nCudaError: " << error_string << std::endl;
  std::cerr << "  File:  " << file << std::endl;
  std::cerr << "  Line:  " << line << std::endl;
  std::cerr << "  Func:  " << func << std::endl;
}
} // namespace gpu

namespace gpu
{
// Repeatedly copies a device buffer back to the host; used to expose the
// per-cudaMemcpy host-memory growth observed on the Jetson TK1.
//
// data: host buffer of n ints (source for the initial upload, destination
//       for every download).
// n:    element count.
// Returns 0 on success (errors abort via cudaSafeCall -> gpu::error).
int memOp(int *data, int n)
{
  int *d_data;
  size_t data_size = n * sizeof(int);

  cudaSafeCall(cudaMalloc((void**)&d_data, data_size));
  cudaSafeCall(cudaMemcpy(d_data, data, data_size, cudaMemcpyHostToDevice));

  // One million device-to-host copies of the same buffer; process memory
  // is sampled externally via /proc/$PID/smaps while this loop runs.
  for (int i = 0; i < 1000000; i++)
    cudaSafeCall(cudaMemcpy(data, d_data, data_size, cudaMemcpyDeviceToHost));

  // Release the device allocation (missing in the original repro — the
  // cudaMalloc result was never freed).
  cudaSafeCall(cudaFree(d_data));

  return 0;
}
} // namespace gpu
using namespace std;

// Entry point: prints the PID (so memory usage can be sampled externally
// from /proc/$PID/smaps) and runs the copy loop on a 512-int host buffer.
int main(int argc, char *argv[])
{
  std::cout << "pid: " << getpid() << std::endl << std::endl;

  vector<int> data(512, 0);
  // NOTE: the original post dropped the first argument here; pass the
  // host buffer explicitly.
  gpu::memOp(&data[0], data.size());

  return 0;
}
Hi Error323,

Thanks for reporting the issue, we are currently investigating the case and we’ll let you know when we have an update.

Hi again,

After doing some more analysis, it appears that the cudaMemcpy leak does stop at some point. The following simple example which was mostly taken from the programming guide also shows the leakage and it seems to be about 4 KiB.

Has there been any progress yet? Because for our project we’re leaking 200MiB in about 10 hrs which is very concerning. We are performing a camera demosaic and distortion correction with remap functions using textures and surfaces. I hope we are messing up somewhere…

#include <math.h>
#include <cuda_runtime_api.h>
#include <curand_kernel.h>

#define BLOCK 32

static texture<float, cudaTextureType2D, cudaReadModeElementType> gTex;

 * Rotates an image
__global__ void transform(float *dst, int w, int h, float a)
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;

  if (x >= w || y >= h)

  float u = x / (float)w;
  float v = y / (float)h;

  u -= 0.5f;
  v -= 0.5f;
  float tu = u * cosf(a) - v * sinf(a) + 0.5f;
  float tv = v * cosf(a) + u * sinf(a) + 0.5f;

  dst[y*w+x] = tex2D(gTex, tu, tv);

// Uploads src (w x h floats) into a CUDA array, binds it to gTex, runs the
// rotation kernel with angle `a` (radians), and copies the result into dst.
//
// Frees all per-call device resources before returning: the original repro
// leaked the cudaArray and the output buffer on every invocation, which is
// the dominant source of the growth observed when calling this in a loop.
void process(const float *src, float *dst, int w, int h, float a)
{
  cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<float>();
  cudaArray *cu_array;
  cudaMallocArray(&cu_array, &chan_desc, w, h);
  cudaMemcpyToArray(cu_array, 0, 0, src, w*h*sizeof(float), cudaMemcpyHostToDevice);

  gTex.addressMode[0] = cudaAddressModeBorder;
  gTex.addressMode[1] = cudaAddressModeBorder;
  gTex.filterMode = cudaFilterModeLinear;
  gTex.normalized = true;

  cudaBindTextureToArray(gTex, cu_array, chan_desc);

  float *output;
  cudaMalloc(&output, w*h*sizeof(float));

  dim3 dimBlock(BLOCK, BLOCK);
  // Ceiling division so partial tiles at the right/bottom edges are covered.
  dim3 dimGrid((w + dimBlock.x - 1) / dimBlock.x,
               (h + dimBlock.y - 1) / dimBlock.y);

  transform<<<dimGrid, dimBlock>>>(output, w, h, a);

  // Blocking copy; also synchronizes with the kernel launch above.
  cudaMemcpy(dst, output, w*h*sizeof(float), cudaMemcpyDeviceToHost);

  // Release per-call resources (missing in the original: leaked every call).
  cudaUnbindTexture(gTex);
  cudaFreeArray(cu_array);
  cudaFree(output);
}


#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>

#include "transform.h"

using namespace std;

// Entry point: prints the PID for external memory monitoring, then runs
// process() ten million times on a constant 64x64 image and dumps the
// final result as integers.
int main(void)
{
  printf("pid: %i\n", getpid());

  // const so the arrays below are standard fixed-size arrays, not VLAs.
  const int w = 64;
  const int h = 64;
  float img[w*h];
  float res[w*h];

  for (int i = 0; i < w*h; i++)
    img[i] = 1.0f;

  // Each iteration allocates, binds, uses, and frees device resources;
  // memory usage is sampled externally while this loop runs.
  int N = 10000000;
  for (int i = 0; i < N; i++)
    process(img, res, w, h, i);

  for (int i = 0; i < h; i++)
    for (int j = 0; j < w; j++)
      printf("%i ", int(res[i*w+j]));

  return 0;
}

Hi Error323,

Could you help confirm whether the issue is still there after upgrading the SDK to R21.4?