Hi-
I’m fairly inexperienced with CUDA so I hope this is an easy fix. However, I really can’t figure it out. I have a kernel I call from a for loop that works fine for small data sets but, for larger data sets, gets slower over time and even totally freezes my computer.
In brief, here is a sketch of the host side of things with many of the repetitive parameters removed:
//The function that calls the kernel, called iteratively after a few other
//changes are made each cycle in the main program.
//
//NOTE(review): cudaMalloc/cudaFree are synchronous and expensive. Allocating
//and freeing every device buffer on each time step dominates runtime for large
//grids — allocate the device buffers once before the time loop and reuse them
//across calls, copying only the data that actually changes each step.
void runTimeStep(const float* h_T1, const float* h_Qr, const char* h_Domain, ..., float* h_T0){
float* d_kappa, ...;
...
char* d_Domain;
//allocate parameters
cutilSafeCall( cudaMalloc( (void**) &d_kappa, sizeof(float)*kappa.length()));
//and some more parameters...
//allocate arrays as 1D
cutilSafeCall( cudaMalloc((void**) &d_T1, sizeof(float)*(Nx*Ny*Nz)));
...
//copy params
cutilSafeCall( cudaMemcpy( d_kappa, h_kappa, sizeof(float)*kappa.length(), cudaMemcpyHostToDevice));
...
//copy 1D arrays
cutilSafeCall( cudaMemcpy( d_T1, h_T1, sizeof(float)*(Nx*Ny*Nz), cudaMemcpyHostToDevice));
...
//output
float* d_T0;
cutilSafeCall( cudaMalloc((void**) &d_T0, sizeof(float)*(Nx*Ny*Nz)));
cutilSafeCall( cudaMemcpy( d_T0, h_T0, sizeof(float)*(Nx*Ny*Nz), cudaMemcpyHostToDevice));
// set up execution parameters
dim3 grid( (Nx/16)+1, (Ny/28)+1, 1);
dim3 threads( 16, 28, 1);
onetimestep<<<grid,threads>>> (d_T1,d_Qr,d_Domain,d_dh2,d_kappa, ..., d_T0);
// Kernel launches return no status: check for launch-configuration errors
// immediately after the launch.
cutilSafeCall( cudaGetLastError());
// The blocking device-to-host memcpy below synchronizes and also surfaces any
// asynchronous fault the kernel hit while executing.
cutilSafeCall( cudaMemcpy( h_T0, d_T0, sizeof(float)*(Nx*Ny*Nz), cudaMemcpyDeviceToHost)); //result!
cudaFree(d_kappa);
cudaFree(d_T1);
...
cudaFree(d_T0);
// BUG FIX: removed the cutilDeviceReset() that used to sit here. Resetting the
// device inside a function that is called every time step destroys and
// recreates the entire CUDA context on each iteration — that is why the run
// got progressively slower. Reset the device (if at all) exactly once, at
// program exit, never per time step.
}
…and here is the kernel itself, again modified for brevity:
#include <stdio.h>
// One explicit time step of the 3D finite-difference update.
// Launch layout: 2D grid of (16 x 28) blocks covering the (Nx, Ny) plane;
// each thread walks the z dimension serially in the k loop below.
__global__ void onetimestep ( const float* T1, ..., float* T2 ) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
// BUG FIX: the host rounds the grid up ((Nx/16)+1 by (Ny/28)+1 blocks), so
// some threads get i >= Nx or j >= Ny. Previously those threads fell into the
// else branch below and wrote T2[ind] OUT OF BOUNDS on every k iteration —
// the corruption grows with the data set and can freeze the machine. Guard
// out-of-range threads here so they do nothing at all.
if (i >= (*Nx) || j >= (*Ny)) return;
int ind,...;
...
float GradPlusLap;
float Perfusion;
for(int k=0; k < (*Nz); k++) {
// NOTE(review): j * (*Ny) looks suspicious — for a row length of Nx the usual
// linearization is k*Nx*Ny + j*(*Nx) + i. This is only correct when Nx == Ny;
// confirm against how the host flattened the arrays.
ind = k*Nxy + j * (*Ny) + i;
if(i > 1 && j > 1 && k > 1 && i < (*Nx)-2 && j < (*Ny)-2 && k < (*Nz)-2) {
GradPlusLap = ...;
Perfusion = ...;
T2[ind] = T1[ind] + *dt * (GradPlusLap - Perfusion + (...)) /
(...);
}
else{
// Boundary cells: copy through unchanged.
T2[ind] = T1[ind];
}//End if
} //End k
}
I’m confident that the kernel itself works as it gives me the proper result for small data sets. Do I have a memory leak? I know I cudaFree() all my device variables, and do a cutilDeviceReset() following each call.
Any help would be much appreciated.