Hi again,
After doing some more analysis, it appears that the cudaMemcpy leak does stop at some point. The following simple example, which was mostly taken from the programming guide, also shows the leak, and it seems to be about 4 KiB.
Has there been any progress yet? For our project we’re leaking 200 MiB in about 10 hours, which is very concerning. We are performing a camera demosaic and distortion correction with remap functions using textures and surfaces. I hope we are messing up somewhere…
transform.cu
#include <math.h>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <curand_kernel.h>
// Edge length of a thread block; process() launches BLOCK x BLOCK threads per block.
#define BLOCK 32
// File-scope 2D float texture reference (element read mode). process() binds it
// to a CUDA array and transform() samples it with tex2D().
static texture<float, cudaTextureType2D, cudaReadModeElementType> gTex;
/**
 * Rotates an image.
 *
 * One thread produces one output pixel: its (x, y) position is converted to
 * coordinates in [0, 1), rotated by `a` around (0.5, 0.5), and the file-level
 * texture gTex is sampled at the rotated position.
 *
 * Launch with a 2D grid of 2D blocks covering at least w x h threads; threads
 * that fall outside the image do nothing.
 *
 * @param dst device buffer receiving w*h floats, row-major
 * @param w   image width in pixels
 * @param h   image height in pixels
 * @param a   rotation angle in radians (fed to cosf/sinf)
 */
__global__ void transform(float *dst, int w, int h, float a)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Only threads that map onto a real pixel write anything.
    if (col < w && row < h)
    {
        // Centre the coordinates on the rotation pivot.
        float cu = col / (float)w;
        float cv = row / (float)h;
        cu -= 0.5f;
        cv -= 0.5f;

        // Rotate, then shift back into texture space.
        const float rot_u = cu * cosf(a) - cv * sinf(a) + 0.5f;
        const float rot_v = cv * cosf(a) + cu * sinf(a) + 0.5f;

        dst[row * w + col] = tex2D(gTex, rot_u, rot_v);
    }
}
/**
 * Reports a failed CUDA call on stderr.
 *
 * @param what short name of the call that produced `err`
 * @param err  status returned by that call
 * @return true when `err` is cudaSuccess, false otherwise
 */
static bool processCheck(const char *what, cudaError_t err)
{
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "process: %s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Rotates a single-channel float image on the GPU.
 *
 * Uploads `src` into a CUDA array, binds that array to the file-level texture
 * reference gTex, runs transform() over a grid of BLOCK x BLOCK thread blocks
 * and copies the result back into `dst`.
 *
 * Every CUDA call is checked. On the first failure the remaining steps are
 * skipped, the error is printed to stderr and `dst` is left unmodified.
 * All resources acquired up to that point are still released.
 *
 * @param src host pointer to w*h input floats, row-major
 * @param dst host pointer receiving w*h output floats, row-major
 * @param w   image width in pixels; nothing is done when <= 0
 * @param h   image height in pixels; nothing is done when <= 0
 * @param a   rotation angle in radians, forwarded to the kernel
 */
void process(const float *src, float *dst, int w, int h, float a)
{
    // An empty image or a missing buffer leaves nothing to allocate or launch.
    if (!src || !dst || w <= 0 || h <= 0)
        return;

    // Widen before multiplying so the byte counts cannot overflow int.
    const size_t row_bytes = (size_t)w * sizeof(float);
    const size_t img_bytes = row_bytes * (size_t)h;

    cudaChannelFormatDesc chan_desc = cudaCreateChannelDesc<float>();
    cudaArray *cu_array = 0;
    float *output = 0;
    bool tex_bound = false;

    bool ok = processCheck("cudaMallocArray",
                           cudaMallocArray(&cu_array, &chan_desc, w, h));

    if (ok)
    {
        // cudaMemcpyToArray is deprecated; the 2D variant copies h tightly
        // packed rows of row_bytes each.
        ok = processCheck("cudaMemcpy2DToArray",
                          cudaMemcpy2DToArray(cu_array, 0, 0, src, row_bytes,
                                              row_bytes, h,
                                              cudaMemcpyHostToDevice));
    }

    if (ok)
    {
        gTex.addressMode[0] = cudaAddressModeBorder;
        gTex.addressMode[1] = cudaAddressModeBorder;
        gTex.filterMode = cudaFilterModeLinear;
        gTex.normalized = true;
        ok = processCheck("cudaBindTextureToArray",
                          cudaBindTextureToArray(gTex, cu_array, chan_desc));
        tex_bound = ok;
    }

    if (ok)
        ok = processCheck("cudaMalloc", cudaMalloc((void **)&output, img_bytes));

    if (ok)
    {
        dim3 dimBlock(BLOCK, BLOCK);
        // Ceil-divide so partially filled tiles at the right/bottom edge are covered.
        dim3 dimGrid((w + dimBlock.x - 1) / dimBlock.x,
                     (h + dimBlock.y - 1) / dimBlock.y);
        transform<<<dimGrid, dimBlock>>>(output, w, h, a);
        // A kernel launch returns no status; configuration errors show up here.
        ok = processCheck("transform launch", cudaGetLastError());
    }

    if (ok)
    {
        // Blocking copy: also waits for the kernel, so execution errors
        // raised by transform() are reported by this call.
        ok = processCheck("cudaMemcpy",
                          cudaMemcpy(dst, output, img_bytes,
                                     cudaMemcpyDeviceToHost));
    }

    // Release in reverse order of acquisition. The texture is unbound first so
    // gTex is not left referring to an array that has been freed.
    if (tex_bound)
        processCheck("cudaUnbindTexture", cudaUnbindTexture(gTex));
    if (output)
        processCheck("cudaFree", cudaFree(output));
    if (cu_array)
        processCheck("cudaFreeArray", cudaFreeArray(cu_array));
}
main.cpp
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#include "transform.h"
using namespace std;
/**
 * Leak reproducer: calls process() N times on a constant 64x64 image and then
 * prints the last result as a grid of integers.
 *
 * The process id is printed first, presumably so memory usage can be watched
 * from an external tool while the loop runs.
 *
 * @return 0 always
 */
int main(void)
{
    // pid_t is not guaranteed to be int; cast to match the %i conversion.
    printf("pid: %i\n", (int)getpid());

    // const so the array bounds below are constant expressions rather than
    // variable-length arrays, which are not standard C++.
    const int w = 64;
    const int h = 64;

    float img[w * h];
    // Zero-initialised so the printout below is defined even if process()
    // never writes a result.
    float res[w * h] = {0};

    for (int i = 0; i < w * h; i++)
        img[i] = 1.0f;

    const int N = 10000000;
    for (int i = 0; i < N; i++)
        process(img, res, w, h, (float)i); // iteration index doubles as the angle

    for (int i = 0; i < h; i++)
    {
        for (int j = 0; j < w; j++)
            printf("%i ", int(res[i * w + j]));
        printf("\n");
    }
    return 0;
}