I am trying to measure a kernel execution time. I have a NxK matrix. The kernel function computes the sum of the elements of a given column of the matrix, (one thread for each K). For a given N and K, the kernel is executed 10000 times. After executing the code I found that (irrespective of K) the execution time for N = 500 is always smaller than the execution time for N = 400. Does anybody has an idea as to what might be the reason for this? Below is the code. Thanks for the help!
// prototypes
global void Tstat(const float *X, float Y, const int N, const int K){
int idx = blockIdx.xblockDim.x + threadIdx.x;
if(idx < K){
// compute start of X segment
int iter = idx*N;
// compute sum of elements
float val = 0.0;
for (int n = 0; n < N; n++)
val += X[iter+n];
Y[idx] = val;
}
}
int main(){
int Nnum = 10;
int Nmin = 100;
int Nstep = 100;
int Knum = 6;
int Kmin = 10000;
int Kstep = 10000;
int *Narray;
Narray = (int )malloc(Nnumsizeof(int));
int *Karray;
Karray = (int )malloc(Knumsizeof(int));
float Time;
Time = (float )malloc(NnumKnumsizeof(float));
// start time profiling
int B = 10000;
for (int it_n = 0; it_n < Nnum; it_n++){
int N = Nmin + it_n*Nstep;
Narray[it_n] = N;
for (int it_k = 0; it_k < Knum; it_k++){
int K = Kmin + it_k*Kstep;
Karray[it_k] = K;
// allocate memory on host
float *X_h;
X_h = (float *)malloc(N*K*sizeof(float));
// generate X_h
int count = 0;
for (int n = 0; n < N; n++){
for (int k = 0; k < K; k++){
X_h[count] = 0.0;
count++;
}
}
// allocate memory on device
float *X_d;
cudaError_t error = cudaMalloc((void **) &X_d, N*K*sizeof(float));
if (error != cudaSuccess){
std::cout << "Failed at malloc:X_d.\n";
return 0;
}
error = cudaMemcpy(X_d, X_h, sizeof(float)*N*K, cudaMemcpyHostToDevice);
if (error != cudaSuccess){
std::cout << "Failed at memcpy:X_d.\n";
return 0;
}
float *Y_d;
error = cudaMalloc((void **) &Y_d, K*sizeof(float));
if (error != cudaSuccess){
std::cout << "Failed at malloc:Y_d.\n";
return 0;
}
// configuration
int blockSize = 256;
int nBlocks = K/blockSize + (K%blockSize == 0?0:1);
time_t start; start = time(NULL);
// start execution
for (int b = 0; b < B; b++){
// compute test statistic
Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
}
time_t end; end = time(NULL);
Time[it_n*Knum+it_k] = end - start; // in seconds
// cleanup
free(X_h);
error = cudaFree(X_d);
if (error != cudaSuccess){
std::cout << "Failed at free:X_d.\n";
return 0;
}
error = cudaFree(Y_d);
if (error != cudaSuccess){
std::cout << "Failed at free:Y_d.\n";
return 0;
}
}
}
// write Narray, Karray and Time to file
FILE *file_p = NULL;
file_p = fopen(“/home/icuda_data/N_K_Time.txt”, “w”);
for (int i = 0; i < Nnum; i++){
for (int j = 0; j < Knum; j++)
fprintf(file_p, “%d %d %.1f\n”, Narray[i], Karray[j], Time[i*Knum+j]);
}
fclose(file_p);
return(0);
}