__device__ __forceinline__
void prefetch_64(const void *ptr, uint32_t ret_0) {
asm volatile (
"ld.global.L2::64B.b32 %0, [%1];"
: "=r"(ret_0)
: "l"(ptr)
);
return;
}
// 定义预取函数,使用 prefetch.global.L2::evict_last 指令
__device__ __forceinline__
void prefetch_l2(const void *ptr) {
// 使用 prefetch.global.L2::evict_last 指令预取数据到 L2 缓存
// 并指定 L2 的逐出策略为 evict_last (驱逐最近最少使用的数据)
asm volatile (
"prefetch.global.L2 [%0];"
: // 没有输出操作数
: "l"(ptr) // 输入操作数,%0 对应 ptr, "l" 表示内存地址
);
}
int start = img_idx + tid/32 * W * C + tid%32 * C;
uint32_t dummy; // 虚拟变量
for(int pre_num=0; pre_num<BLOCK_H/(tid/32);pre_num++){
prefetch_64(&inp[start + pre_num * 4 *W *C],dummy);
}
//load GM to SM
//img 16*16*32
//thread 32*4
uint32_t img_frag[16*2];
int imgm;
int shm_bank;
int iter = BLOCK_H*BLOCK_W * C / (produce_num * 8);
#pragma unroll
for(int i = 0; i<iter; i++){
//所在img位置+所在block位置+所在warp位置+所在thread位置
imgm = img_idx + i * 2 * W * C + warp_id/2 * W * C + tid%64 * 8;
int bs_gm = imgm/(H*W*C);
int h_gm = imgm%(H*W*C)/(W*C);
int w_gm = imgm%(H*W*C)%(W*C)/C;
if(bs_gm<BS && h_gm>=h_idx && h_gm<h_idx+block_h && w_gm>=w_idx && w_gm<w_idx+block_w){
*((float4*)&img_frag[0 + i*4]) = *((float4*)&inp[imgm]);
//ldg_cg_v4(&inp[imgm],img_frag[0 + i*4],img_frag[1 + i*4],img_frag[2 + i*4],img_frag[3 + i*4]);
shm_bank = swizzle(tid*8 + i * 32 * 32);
*((float4*)&img_shm[shm_bank]) = *(float4*)(&img_frag[0 + i*4]);
}
}
In this data loading code, I attempted to use the prefetch instruction to accelerate the data loading because the access pattern of the data is determined, and all the data loaded by sm in one wave can be placed in the L2cache. I checked the PTX documentation and found that there are the above two prefetch instructions. By testing these two instructions and comparing them with not using them, I found that the first instruction had no change to L2cache hit, and the data loading time also remained unchanged. Moreover, when I examined the PTX code and SASS code of this instruction, I discovered that this instruction existed at the PTX level. However, at the sass level, the prefetched sass instruction was not seen, and its corresponding sass code is LDG.E.U16.SYS R5, [R2]; In the experiment of the second prefetch instruction, I found that L2cache hit increased by 15% to 87%, but the data loading time became higher, which puzzled me a lot. When I checked the sass code of this instruction, I found that it was indeed the prefetch instruction CCTL.E.PF2 [R2]. This phenomenon puzzles me a lot. I want to know why. Please help me.