I am trying to understand how clock() works on TITAN (cc3.5). Here is the CUDA code modified from this benchmark: Demystifying GPU Microarchitecture through Microbenchmarking | stuffedcow
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define repeat2(S) S S
#define repeat3(S) repeat2(S) S
#define repeat4(S) repeat2(S S)
#define repeat5(S) repeat4(S) S
__global__ void kclock_test2 (unsigned int *ts, unsigned int *out, int p1, int p2, unsigned int its)
{
unsigned int t1 = p1;
unsigned int t2 = p2;
unsigned int start_time = 0, stop_time = 0;
unsigned int pid = blockIdx.x*blockDim.x + threadIdx.x;
for (int i = 0; i < its; i++)
{
start_time = clock();
repeat5(t1+=t2;t2+=t1;);
stop_time = clock();
}
out[0] = t1+t2;
ts[pid*2] = start_time;
ts[pid*2+1] = stop_time;
}
int main()
{
//measure_clock();
unsigned int ts[1024];
unsigned int *d_ts;
unsigned int *d_out;
// Allocate device array.
cudaError_t errcode;
if (cudaSuccess != (errcode = cudaMalloc((void**)&d_ts, sizeof(ts))))
{
printf ("cudaMalloc failed %s:%d\n", __FILE__, __LINE__);
printf (" %s\n", cudaGetErrorString(errcode));
return 0;
}
if (cudaSuccess != (errcode = cudaMalloc((void**)&d_out, 4)))
{
printf ("cudaMalloc failed %s:%d\n", __FILE__, __LINE__);
printf (" %s\n", cudaGetErrorString(errcode));
return 0;
}
cudaGetLastError();
dim3 Db = dim3(1);
dim3 Dg = dim3(1,1,1);
Dg.x = 14; //1 block per SM
Db.x = 1; //1 thread per block
kclock_test2 <<<Dg, Db>>>(d_ts, d_out, 4, 6, 2);
cudaThreadSynchronize();
cudaMemcpy(ts, d_ts, sizeof(ts), cudaMemcpyDeviceToHost);
for (int i=0; i < 14; i++)
printf (" Block %02d: start: %08u, stop: %08u, diff: %u\n", i, ts[i*2], ts[i*2+1], ts[i*2+1]-ts[i*2]);
cudaFree (d_ts);
cudaFree (d_out);
return 0;
}
When I disassembled the code using cuobjdump (9.0) here is the relevant part of the SASS code:
S2R R8, SR_CLOCKLO; //SCHI: 0x2f 16 cycles
IADD R2, R2, R3; //SCHI: 0x28 9 cycles
IADD R3, R3, R2; //SCHI: 0x28 9 cycles
IADD R2, R2, R3; //SCHI: 0x28 9 cycles
IADD R3, R3, R2; //SCHI: 0x28 9 cycles
IADD R2, R2, R3; //SCHI: 0x28 9 cycles
IADD R3, R3, R2; //SCHI: 0x28 9 cycles
IADD R2, R2, R3; //SCHI: 0x28 9 cycles
IADD R3, R3, R2; //SCHI: 0x28 9 cycles
IADD R2, R2, R3; //SCHI: 0x2f 16 cycles
S2R R7, SR_CLOCKLO; //SCHI: 0x2f 16 cycles
From the stall counts of each instruction, the difference between R7 and R8 should be 104=16+9*8+16 theoretically. But when I run the code on Ubuntu 18.04 with TITAN, here is the result
Block 00: start: 01816203, stop: 01816307, diff: 104
Block 01: start: 01815641, stop: 01815745, diff: 104
Block 02: start: 01815448, stop: 01815552, diff: 104
Block 03: start: 01815143, stop: 01815247, diff: 104
Block 04: start: 01816237, stop: 01816341, diff: 104
Block 05: start: 01815643, stop: 01815747, diff: 104
Block 06: start: 01815446, stop: 01815550, diff: 104
Block 07: start: 01815134, stop: 01815238, diff: 104
Block 08: start: 01814428, stop: 01814570, diff: 142
Block 09: start: 01816237, stop: 01816341, diff: 104
Block 10: start: 01815644, stop: 01815791, diff: 147
Block 11: start: 01815446, stop: 01815550, diff: 104
Block 12: start: 01815143, stop: 01815247, diff: 104
Block 13: start: 01814425, stop: 01814529, diff: 104
Here are my questions with my assumptions
-
I am launching 14 blocks because I assume each block will run on each SMX. From the result, the diff is 104 as expected from some SMs, but not true for others. Why different SMs behave differently?
-
Every time I run the code, I got different numbers for each SM. Does the clock() value depends on some runtime environment? Maybe the hardware does not respect the stall counts completely?
-
I learned that the clock reading instruction (S2R R1, SR_CLOCKLO) is a varying-latency instruction for cc<7.0. However, when I try to put two clock reading instruction back2back, I always get the same diff (16 cycles) between the two readings. Does it mean the latency of the clock reading instruction is always 16 cycles?
Many thanks in advance!!