I’ve run into an interesting problem while working on my newest CUDA kernel. The function accepts an array of image data, then applies a gaussian filter and a gradient of gaussian filter, then outputs the result of each calculation to two separate arrays. For some reason, the time that the function takes to run varies between each execution, it either takes ~.3 seconds or ~.03 seconds. Due to tests of previous algorithms, the expected run speed is the faster one, and I’d like to figure out how I can consistently get it to run at this speed. Here is the kernel code:

```
__global__ static void gaussianGoGCUDA(float * pImageR,float * pImageG, float * pImageB,
float * pImageRGauout, float * pImageGGauout, float * pImageBGauout,
float * pImageRGradout, float * pImageGGradout, float * pImageBGradout,
float * gaus, float * ggx, float * ggy, int height, int width, int win) {
__shared__ float dataR[I_SHARED_SIZE * I_SHARED_SIZE];
__shared__ float dataG[I_SHARED_SIZE * I_SHARED_SIZE];
__shared__ float dataB[I_SHARED_SIZE * I_SHARED_SIZE];
int i,j,k,l,hWin;
int row,col;
float Gau1=0,Gau2=0,Gau3=0;
float Gx1=0,Gx2=0,Gx3=0,Gy1=0,Gy2=0,Gy3=0;
hWin=win>>1;
i=blockIdx.x*blockDim.x + threadIdx.x;
j=blockIdx.y*blockDim.y + threadIdx.y;
int curidx = j * width + i;
//Initialize shared data with corresponding data from image.
//We can use the thread idx numbers to map the image data into the
//correct index of shared memory.
if((i) < width && (j) < height){
dataR[threadIdx.y * I_SHARED_SIZE + threadIdx.x] = pImageR[curidx];
dataG[threadIdx.y * I_SHARED_SIZE + threadIdx.x] = pImageG[curidx];
dataB[threadIdx.y * I_SHARED_SIZE + threadIdx.x] = pImageB[curidx];
//if the current thread is the last row or column, also populate the
//win rows/columns that lie outside the current block.
if(threadIdx.x == 15){
for(int ii = 1; ii < win; ii++){
curidx = j * width + (i+ii);
dataR[threadIdx.y * I_SHARED_SIZE + threadIdx.x+ii] = pImageR[curidx];
dataG[threadIdx.y * I_SHARED_SIZE + threadIdx.x+ii] = pImageG[curidx];
dataB[threadIdx.y * I_SHARED_SIZE + threadIdx.x+ii] = pImageB[curidx];
}
}
if(threadIdx.y == 15){
for(int jj = 1; jj < win; jj++){
curidx = (j+jj) * width + i;
dataR[(threadIdx.y+jj) * I_SHARED_SIZE + threadIdx.x] = pImageR[curidx];
dataG[(threadIdx.y+jj) * I_SHARED_SIZE + threadIdx.x] = pImageG[curidx];
dataB[(threadIdx.y+jj) * I_SHARED_SIZE + threadIdx.x] = pImageB[curidx];
}
}
//If we're in the last thread of the block, we also need to initialize the
//lower corner of shared memory that lies outside the current block
if(threadIdx.y == 15 && threadIdx.x == 15){
//for each row of j, add win columns to shared memory
for(int jj = 1; jj<win; jj++){
for(int ii = 1; ii < win; ii++){
curidx = (j+jj) * width + (i+ii);
dataR[(threadIdx.y+jj) * I_SHARED_SIZE + (threadIdx.x+ii)] = pImageR[curidx];
dataG[(threadIdx.y+jj) * I_SHARED_SIZE + (threadIdx.x+ii)] = pImageG[curidx];
dataB[(threadIdx.y+jj) * I_SHARED_SIZE + (threadIdx.x+ii)] = pImageB[curidx];
}
}
}
}
else{
dataR[0] = -1;
dataG[0] = -1;
dataB[0] = -1;
}
//sync threads to insure all data is loaded.
__syncthreads();
//indexing here needs to be in respect to our shared memory indexes,
//using threadIdx and shared memory dimensions instead of Image dimensions
curidx = threadIdx.y * I_SHARED_SIZE + threadIdx.x;
if((i+win-1) < width && (j+win-1) < height){
for(k=0;k<win;k++){ //Old calculation: (i+k) * 20 + (j+l)
for(l=0;l<win;l++){ // i + (k * 20) + l
Gau1+=(float)dataR[curidx + (k * I_SHARED_SIZE) + l]*gaus[k*win + l];
Gx1+=(float)dataR[curidx + (k * I_SHARED_SIZE) + l]*ggx[k*win + l];
Gy1+=(float)dataR[curidx + (k * I_SHARED_SIZE) + l]*ggy[k*win + l];
Gau2+=(float)dataG[curidx + (k * I_SHARED_SIZE) + l]*gaus[k*win + l];
Gx2+=(float)dataG[curidx + (k * I_SHARED_SIZE) + l]*ggx[k*win + l];
Gy2+=(float)dataG[curidx + (k * I_SHARED_SIZE) + l]*ggy[k*win + l];
Gau3+=(float)dataB[curidx + (k * I_SHARED_SIZE) + l]*gaus[k*win + l];
Gx3+=(float)dataB[curidx + (k * I_SHARED_SIZE) + l]*ggx[k*win + l];
Gy3+=(float)dataB[curidx + (k * I_SHARED_SIZE) + l]*ggy[k*win + l];
}
}
curidx = (j+hWin) * width + (i+hWin);
//store results into the output arrays.
pImageRGauout[curidx]=Gau1;
pImageGGauout[curidx]=Gau2;
pImageBGauout[curidx]=Gau3;
pImageRGradout[curidx]=(float)sqrt(Gx1*Gx1+Gy1*Gy1);
pImageGGradout[curidx]=(float)sqrt(Gx2*Gx2+Gy2*Gy2);
pImageBGradout[curidx]=(float)sqrt(Gx3*Gx3+Gy3*Gy3);
}
}
```

Thanks for any advice.