Jetson TX1 cuda kernel launch overhead

Hi, I am running the sample row-convolution kernel in CUDA using two different approaches, each for multiple iterations.

Approach 1 : 2 blocks of 256 size over the width and iterating throughout the height for every thread.

Approach 2: the sample CUDA row convolution code for separable filters

/*
 * Row (horizontal) pass of a separable 5-tap blur with integer weights
 * {1, 4, 6, 4, 1}, normalised by a right shift of 4 (the weights sum to 16).
 *
 * Layout:
 *   - Launched with a ROWS_BLOCK_DIM_X x ROWS_BLOCK_DIM_Y thread block.
 *   - Each thread handles ROWS_RESULT_STEPS output pixels of one image row,
 *     spaced ROWS_BLOCK_DIM_X apart, so a block covers
 *     ROWS_RESULT_STEPS * ROWS_BLOCK_DIM_X columns by ROWS_BLOCK_DIM_Y rows.
 *   - The shared tile additionally holds ROWS_HALO_STEPS * ROWS_BLOCK_DIM_X
 *     halo pixels on each side; halo pixels that fall outside the image on the
 *     left or right are stored as 0.
 *
 * Parameters:
 *   image    - input pixels, indexed as row * width + column
 *   blurredx - output pixels, same indexing as image
 *   width    - row length in pixels (also used as the row stride)
 *   height   - not referenced in the body
 *
 * Preconditions (nothing in the body enforces them):
 *   - The main-region loads and all stores are unguarded, so the grid must
 *     tile the image exactly: width has to be a multiple of
 *     ROWS_RESULT_STEPS * ROWS_BLOCK_DIM_X and the rows covered by the grid
 *     must all exist in both buffers.
 *   - ROWS_HALO_STEPS * ROWS_BLOCK_DIM_X must be at least 2 so the +/-2 taps
 *     stay inside the shared tile.
 */
__global__ void row_kernel(unsigned char *image , unsigned char *blurredx , int width ,int height)
{
    const int kernel[5] = {1, 4, 6, 4, 1};
    __shared__ unsigned char sh_mem[ROWS_BLOCK_DIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCK_DIM_X];

    // Column of this thread's first (left-halo) pixel; negative for the
    // leftmost blocks, where the halo lies outside the image.
    const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCK_DIM_X + threadIdx.x;
    const int baseY = blockIdx.y * ROWS_BLOCK_DIM_Y + threadIdx.y;

    // Rebase both pointers so that index i * ROWS_BLOCK_DIM_X addresses the
    // i-th step of this thread.
    image += baseY * width + baseX;
    blurredx += baseY * width + baseX;

    // Main region: loaded unconditionally.
    #pragma unroll
    for (int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++)
    {
        sh_mem[threadIdx.y][threadIdx.x + i * ROWS_BLOCK_DIM_X] = image[i * ROWS_BLOCK_DIM_X];
    }

    // Left halo: zero when the source column would be negative.
    #pragma unroll
    for (int i = 0; i < ROWS_HALO_STEPS; i++)
    {
        sh_mem[threadIdx.y][threadIdx.x + i * ROWS_BLOCK_DIM_X] = (baseX >= -i * ROWS_BLOCK_DIM_X) ? image[i * ROWS_BLOCK_DIM_X] : 0;
    }

    // Right halo: zero when the source column would be >= width.
    #pragma unroll
    for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < (ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS); i++)
    {
        sh_mem[threadIdx.y][threadIdx.x + i * ROWS_BLOCK_DIM_X] = (width - baseX > i * ROWS_BLOCK_DIM_X) ? image[i * ROWS_BLOCK_DIM_X] : 0;
    }

    // Every thread reads neighbours' pixels below, so the whole tile must be
    // populated first.
    __syncthreads();

    #pragma unroll
    for (int i = ROWS_HALO_STEPS; i < (ROWS_HALO_STEPS + ROWS_RESULT_STEPS); i++)
    {
        int sum = 0;

        #pragma unroll
        for (int j = -2; j <= 2; j++)
        {
            sum += kernel[2 + j] * sh_mem[threadIdx.y][threadIdx.x + i * ROWS_BLOCK_DIM_X + j];
        }
        // Divide by 16, the sum of the weights.
        sum = sum >> 4;
        blurredx[i * ROWS_BLOCK_DIM_X] = sum;
    }
}

The first approach, though inefficient, gives a consistent kernel launch time of around 40-50 us on the TX1.
However, in the second approach the kernel launch time fluctuates a lot, from 50 us to 400 us, as measured using nvprof.
Note: I have set the clocks to maximum frequency on both the CPU and the GPU.
I am testing on a 640x480 image. Below is how I timestamp Approach 2:
#define ROWS_BLOCK_DIM_X 32
#define ROWS_BLOCK_DIM_Y 8
#define ROWS_RESULT_STEPS 5// UPDATES PER THREAD
#define ROWS_HALO_STEPS 1
// Launch configuration. Both divisions truncate, so this covers the whole
// image only when width is a multiple of ROWS_RESULT_STEPS * ROWS_BLOCK_DIM_X
// and height is a multiple of ROWS_BLOCK_DIM_Y (true for 640x480).
dim3 blocks(width/(ROWS_RESULT_STEPS*ROWS_BLOCK_DIM_X),height/ROWS_BLOCK_DIM_Y);
dim3 threads(ROWS_BLOCK_DIM_X, ROWS_BLOCK_DIM_Y);
int iter=100;
long double time=0.0;
struct timeval t1,t2;
// NOTE: the timed interval spans the launch, the kernel's execution and the
// host-side wait in cudaDeviceSynchronize(), so it is a host wall-clock
// round trip rather than launch overhead alone. cudaEvent timing would
// isolate the GPU-side duration.
for(int i=0;i<iter;i++)
{
    gettimeofday(&t1,NULL);
    row_kernel<<<blocks,threads>>>(d_image,blurredx,width,height);
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    gettimeofday(&t2,NULL);

    // Checked outside the timed interval: a launch that failed would
    // otherwise be recorded as a very fast iteration.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess || syncStatus != cudaSuccess)
    {
        std::cerr << "row_kernel failed: "
                  << cudaGetErrorString(launchStatus != cudaSuccess ? launchStatus : syncStatus)
                  << "\n";
    }

    // Elapsed milliseconds. tv_sec must be included: tv_usec wraps every
    // second, so the microsecond difference alone goes negative whenever the
    // two timestamps straddle a second boundary.
    time += (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
}
time/=iter;
std::cout<<(time)<<" ms\n";
time=0.0;
Any help would be appreciated.