CUDA Timing Question

I am trying to measure kernel execution time. I have an N x K matrix, and the kernel computes the sum of the elements of a given column of the matrix (one thread per column, so K threads in total). For a given N and K, the kernel is executed 10000 times. After running the code I found that, irrespective of K, the execution time for N = 500 is always smaller than the execution time for N = 400. Does anybody have an idea what the reason for this might be? The code is below. Thanks for the help!

#include <cuda.h>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>

// kernel: each thread sums one column of the column-major N x K matrix X into Y
__global__ void Tstat(const float *X, float *Y, const int N, const int K){
int idx = blockIdx.x*blockDim.x + threadIdx.x;

if(idx < K){
// compute start of X segment
int iter = idx*N;

// compute sum of elements
float val = 0.0;
for (int n = 0; n < N; n++)
val += X[iter+n];

Y[idx] = val;
}
}

int main(){

int Nnum = 10;
int Nmin = 100;
int Nstep = 100;

int Knum = 6;
int Kmin = 10000;
int Kstep = 10000;

int *Narray;
Narray = (int *)malloc(Nnum*sizeof(int));
int *Karray;
Karray = (int *)malloc(Knum*sizeof(int));
float *Time;
Time = (float *)malloc(Nnum*Knum*sizeof(float));

// start time profiling
int B = 10000;
for (int it_n = 0; it_n < Nnum; it_n++){
int N = Nmin + it_n*Nstep;
Narray[it_n] = N;

for (int it_k = 0; it_k < Knum; it_k++){
int K = Kmin + it_k*Kstep;
Karray[it_k] = K;

// allocate memory on host
float *X_h;
X_h = (float *)malloc(N*K*sizeof(float));

// generate X_h
int count = 0;
for (int n = 0; n < N; n++){
for (int k = 0; k < K; k++){
X_h[count] = 0.0;
count++;
}
}

// allocate memory on device
float *X_d;
cudaError_t error = cudaMalloc((void **) &X_d, N*K*sizeof(float));
if (error != cudaSuccess){
std::cout << "Failed at malloc:X_d.\n";
return 0;
}
error = cudaMemcpy(X_d, X_h, sizeof(float)*N*K, cudaMemcpyHostToDevice);
if (error != cudaSuccess){
std::cout << "Failed at memcpy:X_d.\n";
return 0;
}

float *Y_d;
error = cudaMalloc((void **) &Y_d, K*sizeof(float));
if (error != cudaSuccess){
std::cout << "Failed at malloc:Y_d.\n";
return 0;
}

// configuration
int blockSize = 256;
int nBlocks = K/blockSize + (K%blockSize == 0?0:1);

time_t start; start = time(NULL);

// start execution
for (int b = 0; b < B; b++){
// compute test statistic
Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
}

time_t end; end = time(NULL);
Time[it_n*Knum+it_k] = end - start; // in seconds

// cleanup
free(X_h);

error = cudaFree(X_d);
if (error != cudaSuccess){
std::cout << "Failed at free:X_d.\n";
return 0;
}

error = cudaFree(Y_d);
if (error != cudaSuccess){
std::cout << "Failed at free:Y_d.\n";
return 0;
}
}
}

// write Narray, Karray and Time to file
FILE *file_p = NULL;
file_p = fopen("/home/icuda_data/N_K_Time.txt", "w");
for (int i = 0; i < Nnum; i++){
for (int j = 0; j < Knum; j++)
fprintf(file_p, "%d %d %.1f\n", Narray[i], Karray[j], Time[i*Knum+j]);
}
fclose(file_p);

return(0);
}

I modified your code to measure time with QTime (Qt library), on:

WinXP Pro 64, VS2005, Tesla C1060, driver 190.38, CUDA 2.3

  1. random input data in column-major order

    for (k = 0; k < K; k++){
        for (n = 0; n < N; n++){
            X_h[count] = ((float)rand())/((float)RAND_MAX) - 0.5f;
            count++;
        } // for n
    } // for k

  2. use cudaThreadSynchronize() to make sure all kernels have finished their work

    This is essential for the time measurement, since kernel launches are asynchronous.

    blockSize = 256;
    nBlocks = K/blockSize + (K%blockSize == 0 ? 0 : 1);

    for (int b = 0; b < B; b++){
        // compute test statistic
        Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
    }

    cudaThreadSynchronize();

On my platform, for B = 1000 (1000 executions):

N = 400, K = 256, time = 219 (ms)

N = 500, K = 256, time = 266 (ms)
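
As a side note on methodology: instead of a host-side timer plus cudaThreadSynchronize(), the launch loop can also be timed with CUDA events, which are recorded in the same stream as the kernels. A minimal sketch using the standard runtime event API, reusing B, nBlocks, blockSize and the device pointers from the code above:

[codebox]
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);                    // enqueue start marker in stream 0
for (int b = 0; b < B; b++){
    Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
}
cudaEventRecord(evStop, 0);                     // enqueue stop marker after all launches
cudaEventSynchronize(evStop);                   // block host until the stop event completes

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, evStart, evStop);  // GPU time in milliseconds

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);
[/codebox]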

By the way, one thread per column is not a good mapping unless you have a lot of columns.

In this case blockSize = 256, which means 1024/256 = 4 blocks can be resident per multiprocessor on a Tesla C1060.

Each multiprocessor executes one block at a time, so 30 blocks can execute simultaneously.

Each multiprocessor executes one warp (32 threads) of a block at a time, so 30 x 32 = 960 threads can execute simultaneously.

However, K = 256 < 960, so the GPU is not fully utilized.
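
To make the utilization argument concrete, here is the launch arithmetic for this case, using the numbers quoted above (a rough estimate, not an exact occupancy calculation):

[codebox]
// one thread per column, K = 256 columns
int K = 256;
int blockSize = 256;
int nBlocks = K/blockSize + (K%blockSize == 0 ? 0 : 1);   // = 1 block of 256 threads

// Tesla C1060: 30 multiprocessors, each issuing one warp (32 threads) at a time
// => roughly 30 * 32 = 960 threads in flight across the chip,
//    but the whole grid only contains K = 256 threads, so most of the GPU is idle.
[/codebox]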

I used

// start execution
for (int b = 0; b < B; b++){
// compute test statistic
Tstat <<<nBlocks, blockSize>>> (X_d, Y_d, N, K);
}

error = cudaThreadSynchronize();
if (error != cudaSuccess){
std::cout << "Failed at ThreadSync:Tstat\n";
return 0;
}

The timing result did not change:

N=400 K=10000 Time=29.0 sec

N=500 K=10000 Time=19.0 sec

I did your experiment again:

GTX 295, B = 10000:

N = 400, K = 10000, time = 35281 (ms)
N = 500, K = 10000, time = 50968 (ms)

Tesla C1060, B = 10000:

N = 400, K = 10000, time = 29016 (ms)
N = 500, K = 10000, time = 24625 (ms)

My Tesla C1060 leads to the same conclusion as yours.

I think the global memory accesses in your code are not coalesced.
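
To make the coalescing point concrete, here is a sketch (my annotation) of the global addresses one warp touches per iteration in the original one-thread-per-column kernel versus the warp-per-column version below:

[codebox]
// Original Tstat: thread idx sums column idx and reads X[idx*N + n].
// For a fixed n, the 32 threads of a warp read
//   X[0*N + n], X[1*N + n], ..., X[31*N + n]
// addresses that are N floats apart, so the reads cannot be combined
// into a single coalesced memory transaction.

// Warp-per-column version: all 32 threads of a warp work on the same column
// and read X[iter + idx], X[iter + idx + 32], ... ; in each iteration the warp
// touches 32 consecutive floats, which does coalesce.
[/codebox]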

I wrote another version that uses one warp to deal with each column.

The block is BLOCKSIZE_X x BLOCKSIZE_Y = 32 x 8 threads, so one block has 8 warps, corresponding to 8 columns.

Each warp computes a partial sum of one column, and the partial sums are then collected.

Reference: NVIDIA, "Optimizing Parallel Reduction in CUDA".

[codebox]
#define BLOCKSIZE_X 32
#define BLOCKSIZE_Y 8

__global__ void Tstat_v2(const float *X, float *Y, const int N, const int K)
{
    __shared__ float sdata[BLOCKSIZE_X][BLOCKSIZE_Y];

    int idx = threadIdx.x;
    int idy = threadIdx.y;
    int colIndex = blockIdx.x * BLOCKSIZE_Y + idy;

    if (colIndex < K){
        // compute start of X segment
        int iter = colIndex*N;

        // compute partial sum of elements (one warp per column, coalesced reads)
        sdata[idx][idy] = 0.0f;
        for (int n = idx; n < N; n += BLOCKSIZE_X){
            sdata[idx][idy] += X[iter + n];
        }
        __syncthreads();

        // collect partial sums (warp-level reduction; guard idx so reads stay in bounds)
        if (BLOCKSIZE_X >= 32 && idx < 16) sdata[idx][idy] += sdata[idx + 16][idy];
        if (BLOCKSIZE_X >= 16 && idx < 8 ) sdata[idx][idy] += sdata[idx + 8 ][idy];
        if (BLOCKSIZE_X >= 8  && idx < 4 ) sdata[idx][idy] += sdata[idx + 4 ][idy];
        if (BLOCKSIZE_X >= 4  && idx < 2 ) sdata[idx][idy] += sdata[idx + 2 ][idy];
        if (BLOCKSIZE_X >= 2  && idx < 1 ) sdata[idx][idy] += sdata[idx + 1 ][idy];

        // store back to Y
        if (0 == idx) Y[colIndex] = sdata[0][idy];
    } // if (colIndex < K)
}
[/codebox]

[codebox]
void exec_v2(float *X_d, float *Y_d, int N, int K)
{
    int blockSize = BLOCKSIZE_X * BLOCKSIZE_Y;

    dim3 threads( BLOCKSIZE_X, BLOCKSIZE_Y, 1);
    dim3 grid( (K + blockSize - 1 )/blockSize, 1, 1);

    Tstat_v2 <<< grid, threads >>> (X_d, Y_d, N, K);
}
[/codebox]

GTX 295, B = 10000:

N = 400, K = 10000, time = 219 (ms)
N = 500, K = 10000, time = 235 (ms)

Tesla C1060, B = 10000:

N = 400, K = 10000, time = 218 (ms)
N = 500, K = 10000, time = 250 (ms)

the result is good enough

I have improved my code by just interchanging the order of the indices in the shared memory array.

[codebox]
__global__ void Tstat_v2(const float *X, float *Y, const int N, const int K)
{
    __shared__ float sdata[BLOCKSIZE_Y][BLOCKSIZE_X];

    int idx = threadIdx.x;
    int idy = threadIdx.y;
    int colIndex = blockIdx.x * BLOCKSIZE_Y + idy;

    if (colIndex < K){
        // compute start of X segment
        int iter = colIndex*N;

        // compute partial sum of elements
        sdata[idy][idx] = 0.0f;
        for (int n = idx; n < N; n += BLOCKSIZE_X){
            sdata[idy][idx] += X[iter + n];
        }
        __syncthreads();

        // collect partial sums (warp-level reduction; guard idx so reads stay in bounds)
        if (BLOCKSIZE_X >= 32 && idx < 16) sdata[idy][idx] += sdata[idy][idx + 16];
        if (BLOCKSIZE_X >= 16 && idx < 8 ) sdata[idy][idx] += sdata[idy][idx + 8 ];
        if (BLOCKSIZE_X >= 8  && idx < 4 ) sdata[idy][idx] += sdata[idy][idx + 4 ];
        if (BLOCKSIZE_X >= 4  && idx < 2 ) sdata[idy][idx] += sdata[idy][idx + 2 ];
        if (BLOCKSIZE_X >= 2  && idx < 1 ) sdata[idy][idx] += sdata[idy][idx + 1 ];

        // store back to Y
        if (0 == idx) Y[colIndex] = sdata[idy][0];
    } // if (colIndex < K)
}
[/codebox]
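
Presumably the gain from swapping the indices comes from shared-memory bank conflicts (assuming the 16-bank layout of compute 1.x parts); a sketch of the addressing:

[codebox]
// sdata[BLOCKSIZE_X][BLOCKSIZE_Y] = sdata[32][8]:
//   thread (idx, idy) touches word idx*8 + idy; within a half-warp idx varies,
//   so consecutive threads are 8 words apart and repeatedly hit the same banks
//   -> the accesses are serialized.
//
// sdata[BLOCKSIZE_Y][BLOCKSIZE_X] = sdata[8][32]:
//   thread (idx, idy) touches word idy*32 + idx; within a half-warp the words
//   are consecutive, so each thread hits a different bank -> no conflicts.
[/codebox]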

GTX 295, B = 10000:

N = 400, K = 10000, time = 125 (ms)
N = 500, K = 10000, time = 156 (ms)

Tesla C1060, B = 10000:

N = 400, K = 10000, time = 141 (ms)
N = 500, K = 10000, time = 188 (ms)

Thanks for the feedback!

Are you sure the configuration is correct?

int blockSize = BLOCKSIZE_X * BLOCKSIZE_Y;
dim3 threads( BLOCKSIZE_X, BLOCKSIZE_Y, 1);
dim3 grid( (K + blockSize - 1 )/blockSize, 1, 1);
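
If each block is really meant to cover BLOCKSIZE_Y columns (as colIndex = blockIdx.x * BLOCKSIZE_Y + idy in the kernel suggests), then presumably the grid size would have to be derived from BLOCKSIZE_Y rather than from the full block size. A sketch under that assumption (my guess, not code from the posts above):

[codebox]
// hypothetical launch configuration: one block per BLOCKSIZE_Y columns
dim3 threads( BLOCKSIZE_X, BLOCKSIZE_Y, 1);
dim3 grid( (K + BLOCKSIZE_Y - 1) / BLOCKSIZE_Y, 1, 1);
Tstat_v2 <<< grid, threads >>> (X_d, Y_d, N, K);
[/codebox]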