I want to get the exec. time of different parts (code lines) in kernel function , is there any good ways ?
I have tried using clock64()
function in kernel. I have add timstamp, then get duration time of only one certain.
__global__ void reduce2(int* d_in, int* d_out) {
__shared__ int sdata[THREAD_PER_BLOCK];
int i = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
clock_t t1, t2;
if(i==TID) {
t1 = clock64();
}
sdata[tid] = d_in[i];
__syncthreads();
// reduce
// 消除banck conflict
// s=128时, th0访问data[0] & data[128], th1访问data[1] & data[129], th31访问data[31] & data[31+128]
for(unsigned int s=blockDim.x/2; s>0; s>>=1){
if(tid < s){
sdata[tid]+=sdata[tid+s];
}
__syncthreads();
}
if(tid==0) {
d_out[blockIdx.x] = sdata[0];
}
if(i==TID) {
t2 = clock64();
float time_cost = (float)(t2 - t1)/CLOCK_RATE_KHZ;
printf("time cost=%fms\n", time_cost);
}
}
The CLOCK_RATE_KHZ
is obtained by here:
// A100, GPU Boost Clock=1410MHz
cudaDeviceProp prop;
CHECK_ERROR(cudaGetDeviceProperties(&prop, 1));
clock_t clock_rate = prop.clockRate; // Clock frequency in kilohertz
std::cout << "clock_rate(kHz): " << clock_rate << std::endl;
main function: (GPUTImer is implemented by cudaEvent)
GPUTimer gputimer;
gputimer.start();
reduce2<<<blocks_per_grid, threads_per_block>>>(d_a, d_out);
CHECK_ERROR(cudaGetLastError());
gputimer.stop();
printf("GPU time: %fms\n", gputimer.elapsed_ms());
Here is my result:
clock_rate(kHz): 1410000
time cost=0.002268ms (inside kernel time, one thread)
GPU time: 11.432960ms (whole kernel dur time)
the 0.002268ms is much less than whole kernel dur time. This there any wrong ?