Indeed, according to my testing, the reason for the very large number (hundreds of thousands) of cycles is UM demand-paging. By prefetching or switching to the non-managed device memory, the measurement (for me) drops from hundreds of thousands of cycles to about 1000 cycles. Still seems high. A write is supposed to be fire-and-forget, i.e. not many clock cycles to dispatch.
- Studying PTX isn’t that useful for these kinds of exercises, in my experience/opinion.
- In my experience, it's pretty important to study the SASS to make sure that the use of
clock64()
or other similar in-kernel timing is actually bracketing what you want.
- a synchronization or two may also help with isolation.
Let’s go through a few steps. If I compile the code as-is for my cc8.9 L4 GPU, I get output in the hundreds of thousands:
# nvcc -o t270a t270a.cu -arch=sm_89
# ./t270a
#967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967101 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104 #967104
###########
#967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082 #967082
###########
#967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103 #967103
###########
#967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093 #967093
###########
If we switch to using non-managed memory (and remove the kernel loop of 1), the measurement drops to around 1000:
# cat t270.cu
#include <stdint.h>
#include <iostream>
#define BLOCKS 4
#define THREADS_PER_BLOCK 64
// Kernel under test: brackets a single 32-bit global store with clock64()
// and records the per-thread elapsed cycle count.
// NOTE(review): the timed store consumes the value produced by the preceding
// load, so (as the SASS below shows) the STG cannot issue until the LDG
// completes -- the measured interval includes the load latency.
__global__ void test_load_time(int * data_ptr, uint64_t* result_ptr){
// Flat global thread index (1-D grid, 1-D blocks).
int global_id = blockDim.x * blockIdx.x + threadIdx.x;
// Total thread count; used as an offset into the upper half of data_ptr.
int max_items = gridDim.x*blockDim.x;
// Load from the upper half of the buffer, outside the timed region in
// source order -- but the store below still depends on its result.
int load_temp=data_ptr[global_id+max_items];
auto start_time = clock64();
// The operation being timed: one global store to the lower half.
data_ptr[global_id]=load_temp;
auto end_time= clock64();
// Per-thread elapsed clocks written out for the host to print.
result_ptr[global_id]=(end_time-start_time);
}
// Host driver: allocates device buffers, launches the timing kernel once,
// copies the per-thread cycle counts back, and prints them one row per block.
// NOTE(review): CUDA API return codes and the kernel launch are unchecked,
// and data_ptr/result_p/hresult_p are never freed -- acceptable for a
// throwaway measurement demo, but add error checking in real code.
// The input buffer is never initialized; only the timing, not the data,
// is inspected.
int main(){
int * data_ptr;
// 2x the grid size: lower half is the store target, upper half the load source.
cudaMalloc(&data_ptr, sizeof(int)*BLOCKS*THREADS_PER_BLOCK*2);
uint64_t * result_p;
cudaMalloc(&result_p, sizeof(uint64_t)*BLOCKS*THREADS_PER_BLOCK);
test_load_time<<<BLOCKS,THREADS_PER_BLOCK>>>(data_ptr,result_p);
uint64_t * hresult_p = new uint64_t[BLOCKS*THREADS_PER_BLOCK];
// Blocking copy; also synchronizes with the preceding default-stream launch.
cudaMemcpy(hresult_p, result_p, sizeof(uint64_t)*BLOCKS*THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
// One "#"-prefixed count per thread, one row (plus separator) per block.
for(size_t i=0;i<BLOCKS; i++){
for(size_t j =0; j<THREADS_PER_BLOCK; j++){
std::cout<<"#"<<hresult_p[i*THREADS_PER_BLOCK+j]<<"\t";
}
std::cout<<"\n###########\n";
}
}
# nvcc -o t270 t270.cu -arch=sm_89
# ./t270
#1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094 #1094
###########
#1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014 #1014
###########
#1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087 #1087
###########
#1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024 #1024
###########
# cuobjdump -sass ./t270
Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_89
Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_89
Function : _Z14test_load_timePiPm
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0x0000000000007919 */
/* 0x000e220000002500 */
/*0020*/ MOV R5, c[0x0][0xc] ; /* 0x0000030000057a02 */
/* 0x000fe20000000f00 */
/*0030*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0040*/ MOV R9, 0x4 ; /* 0x0000000400097802 */
/* 0x000fe20000000f00 */
/*0050*/ S2R R3, SR_TID.X ; /* 0x0000000000037919 */
/* 0x000e240000002100 */
/*0060*/ IMAD R0, R0, c[0x0][0x0], R3 ; /* 0x0000000000007a24 */
/* 0x001fc800078e0203 */
/*0070*/ IMAD R2, R5, c[0x0][0x0], R0 ; /* 0x0000000005027a24 */
/* 0x000fc800078e0200 */
/*0080*/ IMAD.WIDE R2, R2, R9, c[0x0][0x160] ; /* 0x0000580002027625 */
/* 0x000fca00078e0209 */
/*0090*/ LDG.E R7, [R2.64] ; /* 0x0000000402077981 */
/* 0x000164000c1e1900 */
/*00a0*/ CS2R R4, SR_CLOCKLO ; /* 0x0000000000047805 */
/* 0x000fc40000015000 */
/*00b0*/ IMAD.WIDE R2, R0, R9, c[0x0][0x160] ; /* 0x0000580000027625 */
/* 0x001fca00078e0209 */
/*00c0*/ STG.E [R2.64], R7 ; /* 0x0000000702007986 */
/* 0x0201e4000c101904 */
/*00d0*/ CS2R R2, SR_CLOCKLO ; /* 0x0000000000027805 */
/* 0x001fcc0000015000 */
/*00e0*/ IADD3 R4, P0, -R4, R2, RZ ; /* 0x0000000204047210 */
/* 0x000fe40007f1e1ff */
/*00f0*/ MOV R7, 0x8 ; /* 0x0000000800077802 */
/* 0x000fe40000000f00 */
/*0100*/ IADD3.X R5, ~R5, R3, RZ, P0, !PT ; /* 0x0000000305057210 */
/* 0x000fc600007fe5ff */
/*0110*/ IMAD.WIDE R2, R0, R7, c[0x0][0x168] ; /* 0x00005a0000027625 */
/* 0x000fca00078e0207 */
/*0120*/ STG.E.64 [R2.64], R4 ; /* 0x0000000402007986 */
/* 0x000fe2000c101b04 */
/*0130*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0140*/ BRA 0x140; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0150*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0160*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0170*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0180*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0190*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Fatbin ptx code:
================
arch = sm_89
code version = [8,2]
host = linux
compile_size = 64bit
compressed
#
But that still seems long. Aren’t we just timing a single write? Aren’t GPU writes “fire and forget”? Shouldn’t the instruction latency be pretty low, like maybe 20, not 1000? Studying the SASS above shows
LDG.E R7, [R2.64] ; // int load_temp=data_ptr[global_id+max_items];
CS2R R4, SR_CLOCKLO ; // auto start_time = clock64();
IMAD.WIDE R2, R0, R9, c[0x0][0x160] ;
STG.E [R2.64], R7 ; // data_ptr[global_id]=load_temp;
CS2R R2, SR_CLOCKLO ; // auto end_time= clock64();
There are two issues.
- the store operation (STG) has a register-dependency via R7 on the load operation (LDG) which is loading R7. Therefore the store operation cannot be issued until the load latency completes, which is “long”. To get around this, I will propose to put in an execution/memory barrier prior to the first timestamp, but after the load.
- We have the extra index calculation instruction (IMAD) that the compiler put there, but maybe we don’t want it there. To get around that, I will suggest issuing the same instruction prior to the barrier, forcing the compiler to schedule the index calculation earlier.
The modified code and results are like this:
# cat t270.cu
#include <stdint.h>
#include <iostream>
#define BLOCKS 4
#define THREADS_PER_BLOCK 64
// Modified kernel: an identical "priming" store plus a block-wide barrier
// are inserted before the first timestamp, so the timed STG neither waits
// on the LDG's latency nor pulls the address computation (IMAD) into the
// timed region.
__global__ void test_load_time(int * data_ptr, uint64_t* result_ptr){
// Flat global thread index (1-D grid, 1-D blocks).
int global_id = blockDim.x * blockIdx.x + threadIdx.x;
// Total thread count; offset into the upper half of data_ptr.
int max_items = gridDim.x*blockDim.x;
int load_temp=data_ptr[global_id+max_items];
// Same store issued early: drains the load dependency and forces the
// compiler to compute the store address before the timed region.
data_ptr[global_id]=load_temp; // added
// Barrier isolates the timed region from all preceding activity.
__syncthreads(); // added
auto start_time = clock64();
// Timed region now contains only the STG between the two CS2R timestamps.
data_ptr[global_id]=load_temp;
auto end_time= clock64();
// Per-thread elapsed clocks written out for the host to print.
result_ptr[global_id]=(end_time-start_time);
}
// Host driver (unchanged from the first version): allocates device buffers,
// launches the timing kernel once, copies the per-thread cycle counts back,
// and prints them one row per block.
// NOTE(review): CUDA API return codes and the kernel launch are unchecked,
// and the allocations are never freed -- fine for a measurement demo,
// but add error checking in real code.
int main(){
int * data_ptr;
// 2x the grid size: lower half is the store target, upper half the load source.
cudaMalloc(&data_ptr, sizeof(int)*BLOCKS*THREADS_PER_BLOCK*2);
uint64_t * result_p;
cudaMalloc(&result_p, sizeof(uint64_t)*BLOCKS*THREADS_PER_BLOCK);
test_load_time<<<BLOCKS,THREADS_PER_BLOCK>>>(data_ptr,result_p);
uint64_t * hresult_p = new uint64_t[BLOCKS*THREADS_PER_BLOCK];
// Blocking copy; also synchronizes with the preceding default-stream launch.
cudaMemcpy(hresult_p, result_p, sizeof(uint64_t)*BLOCKS*THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
// One "#"-prefixed count per thread, one row (plus separator) per block.
for(size_t i=0;i<BLOCKS; i++){
for(size_t j =0; j<THREADS_PER_BLOCK; j++){
std::cout<<"#"<<hresult_p[i*THREADS_PER_BLOCK+j]<<"\t";
}
std::cout<<"\n###########\n";
}
}
# nvcc -o t270 t270.cu -arch=sm_89
# ./t270
#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20
###########
#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20
###########
#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20
###########
#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18#18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #18 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20#20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20 #20
###########
# cuobjdump -sass ./t270
Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_89
Fatbin elf code:
================
arch = sm_89
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_89
Function : _Z14test_load_timePiPm
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM89 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM89)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ S2R R0, SR_CTAID.X ; /* 0x0000000000007919 */
/* 0x000e220000002500 */
/*0020*/ MOV R5, c[0x0][0xc] ; /* 0x0000030000057a02 */
/* 0x000fe20000000f00 */
/*0030*/ ULDC.64 UR4, c[0x0][0x118] ; /* 0x0000460000047ab9 */
/* 0x000fe20000000a00 */
/*0040*/ MOV R9, 0x4 ; /* 0x0000000400097802 */
/* 0x000fe20000000f00 */
/*0050*/ S2R R3, SR_TID.X ; /* 0x0000000000037919 */
/* 0x000e240000002100 */
/*0060*/ IMAD R0, R0, c[0x0][0x0], R3 ; /* 0x0000000000007a24 */
/* 0x001fc800078e0203 */
/*0070*/ IMAD R2, R5, c[0x0][0x0], R0 ; /* 0x0000000005027a24 */
/* 0x000fc800078e0200 */
/*0080*/ IMAD.WIDE R2, R2, R9, c[0x0][0x160] ; /* 0x0000580002027625 */
/* 0x000fca00078e0209 */
/*0090*/ LDG.E R7, [R2.64] ; /* 0x0000000402077981 */
/* 0x000ea2000c1e1900 */
/*00a0*/ IMAD.WIDE R4, R0, R9, c[0x0][0x160] ; /* 0x0000580000047625 */
/* 0x000fca00078e0209 */
/*00b0*/ STG.E [R4.64], R7 ; /* 0x0000000704007986 */
/* 0x004fe8000c101904 */
/*00c0*/ BAR.SYNC 0x0 ; /* 0x0000000000007b1d */
/* 0x000fec0000000000 */
/*00d0*/ CS2R R2, SR_CLOCKLO ; /* 0x0000000000027805 */
/* 0x000fca0000015000 */
/*00e0*/ STG.E [R4.64], R7 ; /* 0x0000000704007986 */
/* 0x0001e4000c101904 */
/*00f0*/ CS2R R4, SR_CLOCKLO ; /* 0x0000000000047805 */
/* 0x001fcc0000015000 */
/*0100*/ IADD3 R4, P0, -R2, R4, RZ ; /* 0x0000000402047210 */
/* 0x000fe40007f1e1ff */
/*0110*/ MOV R7, 0x8 ; /* 0x0000000800077802 */
/* 0x000fe40000000f00 */
/*0120*/ IADD3.X R5, ~R3, R5, RZ, P0, !PT ; /* 0x0000000503057210 */
/* 0x000fc600007fe5ff */
/*0130*/ IMAD.WIDE R2, R0, R7, c[0x0][0x168] ; /* 0x00005a0000027625 */
/* 0x000fca00078e0207 */
/*0140*/ STG.E.64 [R2.64], R4 ; /* 0x0000000402007986 */
/* 0x000fe2000c101b04 */
/*0150*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0160*/ BRA 0x160; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0170*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0180*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0190*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01c0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01d0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01e0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*01f0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
Fatbin ptx code:
================
arch = sm_89
code version = [8,2]
host = linux
compile_size = 64bit
compressed
#
Now the SASS shows just the timestamps with only the STG operation in-between, and we have “pushed” the LDG latency out of the timing region with __syncthreads().
Does 18 still seem long? Perhaps. I suspect that this timing method includes the issue latency of the STG operation plus at least one issue latency of CS2R from the timing bracketing. If that is the case, it seems unavoidable to me.
I don’t personally know of a general answer to this question. In my experience, you have to analyze the SASS, understand what is contributing, and then use code modification to produce a SASS sequence that is as close as possible to the desired one. For me personally, it's a bit of trial-and-error. I don’t know of a single formula or template to accomplish it.