CUDA Timing Question

I am trying to measure a kernel execution time. I have a NxK matrix. The kernel function computes the sum of the elements of a given column of the matrix, (one thread for each K). For a given N and K, the kernel is executed 10000 times. After executing the code I found that (irrespective of K) the execution time for N = 500 is always smaller than the execution time for N = 400. Does anybody has an idea as to what might be the reason for this? Below is the code. Thanks for the help!

#include <cuda.h>
#include

// prototypes
global void Tstat(const float *X, float Y, const int N, const int K){
int idx = blockIdx.x
blockDim.x + threadIdx.x;

if(idx < K){
// compute start of X segment
int iter = idx*N;

// compute sum of elements
float val = 0.0;
for (int n = 0; n < N; n++)
  val += X[iter+n];

Y[idx] = val;

}
}

int main(){

int Nnum = 10;
int Nmin = 100;
int Nstep = 100;

int Knum = 6;
int Kmin = 10000;
int Kstep = 10000;

int *Narray;
Narray = (int )malloc(Nnumsizeof(int));
int *Karray;
Karray = (int )malloc(Knumsizeof(int));
float Time;
Time = (float )malloc(NnumKnum
sizeof(float));

// start time profiling
int B = 10000;
for (int it_n = 0; it_n < Nnum; it_n++){
int N = Nmin + it_n*Nstep;
Narray[it_n] = N;

for (int it_k = 0; it_k < Knum; it_k++){
  int K = Kmin + it_k*Kstep;
  Karray[it_k] = K;

  // allocate memory on host  
  float *X_h;                                                                                                                              
  X_h = (float *)malloc(N*K*sizeof(float));

  // generate X_h
  int count = 0;
  for (int n = 0; n < N; n++){
for (int k = 0; k < K; k++){
  X_h[count] = 0.0;
  count++;
}
  }

  // allocate memory on device    
  float *X_d;
  cudaError_t error = cudaMalloc((void **) &X_d, N*K*sizeof(float));
  if (error != cudaSuccess){
std::cout << "Failed at malloc:X_d.\n";
return 0;
  }
  error = cudaMemcpy(X_d, X_h, sizeof(float)*N*K, cudaMemcpyHostToDevice);
  if (error != cudaSuccess){
std::cout << "Failed at memcpy:X_d.\n";
return 0;
  }

  float *Y_d;                                                                                                                                                 
  error = cudaMalloc((void **) &Y_d, K*sizeof(float));
  if (error != cudaSuccess){
std::cout << "Failed at malloc:Y_d.\n";
return 0;
  }

  // configuration
  int blockSize = 256;
  int nBlocks = K/blockSize + (K%blockSize == 0?0:1);

  time_t start; start = time(NULL);
                                                                                                                      
  // start execution
  for (int b = 0; b < B; b++){
// compute test statistic
Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
  }

  time_t end; end = time(NULL);
  Time[it_n*Knum+it_k] = end - start; // in seconds

  // cleanup
  free(X_h);

  error = cudaFree(X_d);
  if (error != cudaSuccess){
std::cout << "Failed at free:X_d.\n";
return 0;
  }
 
  error = cudaFree(Y_d);
  if (error != cudaSuccess){
std::cout << "Failed at free:Y_d.\n";
return 0;
  }
}

}

// write Narray, Karray and Time to file
FILE *file_p = NULL;
file_p = fopen(“/home/icuda_data/N_K_Time.txt”, “w”);
for (int i = 0; i < Nnum; i++){
for (int j = 0; j < Knum; j++)
fprintf(file_p, “%d %d %.1f\n”, Narray[i], Karray[j], Time[i*Knum+j]);
}
fclose(file_p);

return(0);
}