cudaFree() error + loop

Hi,

I want to run a kernel in a loop. First I tried to allocate memory each time before the kernel call and free it after the call finishes. The problem is that cudaFree() does not seem to work properly, so my graphics memory fills up after a short time.

So I tried allocating the memory just once before the first kernel call and reusing it in all further kernel calls. That works, but now every run of the kernel takes more time than the preceding one. The test program is the clock example from the SDK, extended with a loop:

main.c

[codebox]
cudaStart( stderr );

int i;
for( i = 0; i < 100; i++ ) {
    printf( "Start no. %d\n", i );
    run();
}

cudaExit();[/codebox]

cudaStart() and cudaExit() just call cudaSetDevice( 0 ) and cudaThreadExit(), respectively.
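For completeness, the two wrappers amount to roughly this (the stream argument passed from main.c is ignored in this sketch):

[codebox]
// Rough sketch of the wrappers described above: they only select the
// device and tear the context down; the FILE* argument is not used here.
void cudaStart( FILE * stream )
{
    (void)stream;
    cudaSetDevice( 0 );
}

void cudaExit( void )
{
    cudaThreadExit();
}[/codebox]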

clock.cu:

[codebox]
#define NUM_BLOCKS  64
#define NUM_THREADS 256

extern "C" void run()
{
    static float   * dinput  = NULL;
    static float   * doutput = NULL;
    static clock_t * dtimer  = NULL;
    static int first = 0;

    clock_t timer[NUM_BLOCKS * 2];
    float   input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    // Allocate the device buffers only on the first call; reuse them afterwards.
    if( !first )
    {
        cutilSafeCall(cudaMalloc((void**)&dinput,  sizeof(float)   * NUM_THREADS * 2));
        cutilSafeCall(cudaMalloc((void**)&doutput, sizeof(float)   * NUM_BLOCKS));
        cutilSafeCall(cudaMalloc((void**)&dtimer,  sizeof(clock_t) * NUM_BLOCKS * 2));
        first = 1;
    }

    cutilSafeCall(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

    //cutilSafeCall(cudaMemcpy(output, doutput, sizeof(float) * NUM_BLOCKS, cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    // cutilSafeCall(cudaFree(dinput));
    // cutilSafeCall(cudaFree(doutput));
    // cutilSafeCall(cudaFree(dtimer));

    // Compute the difference between the last block end and the first block start.
    clock_t minStart = timer[0];
    clock_t maxEnd   = timer[NUM_BLOCKS];

    for (int i = 1; i < NUM_BLOCKS; i++)
    {
        minStart = timer[i] < minStart ? timer[i] : minStart;
        maxEnd   = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
    }

    printf("time = %d\n", maxEnd - minStart);
    fflush( 0 );

    cudaThreadSynchronize();
}[/codebox]

clock_kernel.cu:

[codebox]
// Parallel reduction (minimum) that records per-block start and end
// timestamps with clock().
__global__ static void timedReduction(const float * input, float * output, clock_t * timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();

    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0) {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    __syncthreads();

    if (tid == 0) timer[bid+gridDim.x] = clock();
}[/codebox]

The first kernel run takes about 76,000 clock cycles; the 100th kernel run takes about 200,000 clock cycles.

Can anybody tell me where the error is? What's wrong with cudaFree()?

I use a GeForce 9500 GT.

The problem with the increasing run time does not occur if I put a sleep() of 1 s after each call. It seems that CUDA needs some time for cleaning up or something like that. Has anyone else seen such a problem with loops?
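For reference, the variant of the loop with that workaround looks roughly like this (assuming sleep() from <unistd.h> on Linux):

[codebox]
#include <unistd.h>

int i;
for( i = 0; i < 100; i++ ) {
    printf( "Start no. %d\n", i );
    run();
    sleep( 1 );   /* with this 1 s pause the measured kernel time stays constant */
}[/codebox]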