Hello.
I use the VABSDIFF4 instruction in CUDA/C++ by Windows Visual Studio 2015.
However, there are differences in the number of instructions in vabsdiff4 compiled with VOLTA compatible GPUs and vabsdiff4 compiled with PASCAL compatible GPUs.
For this reason, performance is deteriorating and I am in trouble.
Would you tell me how to use the same instruction as PASCAL in VOLTA?
I using following GPUs.
The dump result of SASS by “cuobjdump” is shown below.
Use GPUs:
- VOLTA GPU: NVIDIA TITAN V
- PASCAL GPU: NVIDIA TITAN X (Pascal)
Kernel code:
__global__ void
cudaTest(unsigned int *result, unsigned int srcA, unsigned int srcB, unsigned int c)
{
unsigned int d;
asm volatile("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;":"=r"(d) : "r"(srcA), "r"(srcB), "r"(c));
*result = d;
}
PASCAL SASS dump (cuobjdump.exe -sass kernel.compute_61.sm_61.cubin):
code for sm_61
Function : _Z8cudaTestPjjjj
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fc400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x148] ; /* 0x4c98078005270000 */
/*0018*/ MOV R5, c[0x0][0x14c] ; /* 0x4c98078005370005 */
/* 0x001fd000fe2007f1 */
/*0028*/ MOV R6, c[0x0][0x150] ; /* 0x4c98078005470006 */
/*0030*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0038*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/* 0x001fbc00fe2007e2 */
/*0048*/ VABSDIFF4.U8.U8.ACC R0, R0, R5, R6 ; /* 0x5038033440570000 */
/*0050*/ STG.E [R2], R0 ; /* 0xeedc200000070200 */
/*0058*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001ffc00fc4007ef */
/*0068*/ NOP ; /* 0x50b0000000070f00 */
/*0070*/ NOP ; /* 0x50b0000000070f00 */
/*0078*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0088*/ BRA 0x80 ; /* 0xe2400fffff07000f */
/*0090*/ NOP; /* 0x50b0000000070f00 */
/*0098*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*00a8*/ NOP; /* 0x50b0000000070f00 */
/*00b0*/ NOP; /* 0x50b0000000070f00 */
/*00b8*/ NOP; /* 0x50b0000000070f00 */
...........................
VOLTA SASS dump (cuobjdump.exe -sass kernel.compute_70.sm_70.cubin):
code for sm_70
Function : _Z8cudaTestPjjjj
.headerflags @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
/*0000*/ @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ; /* 0x000000fffffff389 */
/* 0x000fe200000e00ff */
/*0010*/ IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ; /* 0x00000a00ff017624 */
/* 0x000fd000078e00ff */
/*0020*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x168] ; /* 0x00005a00ff027624 */
/* 0x000fe400078e00ff */
/*0030*/ IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x16c] ; /* 0x00005b00ff007624 */
/* 0x000fe400078e00ff */
/*0040*/ IMAD.MOV.U32 R3, RZ, RZ, 0x7654 ; /* 0x00007654ff037424 */
/* 0x000fc600078e00ff */
/*0050*/ PRMT R0, R2.reuse, 0x3210, R0 ; /* 0x0000321002007816 */
/* 0x040fe40000000000 */
/*0060*/ PRMT R2, R2, R3, c[0x0][0x16c] ; /* 0x00005b0002027616 */
/* 0x000fe40000000003 */
/*0070*/ PRMT R3, R0.reuse, 0x7650, RZ ; /* 0x0000765000037816 */
/* 0x040fe400000000ff */
/*0080*/ PRMT R4, R0, 0x7651, RZ ; /* 0x0000765100047816 */
/* 0x000fe400000000ff */
/*0090*/ PRMT R6, R2.reuse, 0x7650, RZ ; /* 0x0000765002067816 */
/* 0x040fe400000000ff */
/*00a0*/ PRMT R7, R2, 0x7651, RZ ; /* 0x0000765102077816 */
/* 0x000fc400000000ff */
/*00b0*/ PRMT R5, R0, 0x7652, RZ ; /* 0x0000765200057816 */
/* 0x000fe400000000ff */
/*00c0*/ PRMT R9, R2, 0x7653, RZ ; /* 0x0000765302097816 */
/* 0x000fe200000000ff */
/*00d0*/ IMAD.IADD R3, R3, 0x1, -R6 ; /* 0x0000000103037824 */
/* 0x000fe200078e0a06 */
/*00e0*/ PRMT R0, R0, 0x7653, RZ ; /* 0x0000765300007816 */
/* 0x000fe200000000ff */
/*00f0*/ IMAD.IADD R4, R4, 0x1, -R7 ; /* 0x0000000104047824 */
/* 0x000fe200078e0a07 */
/*0100*/ PRMT R2, R2, 0x7652, RZ ; /* 0x0000765202027816 */
/* 0x000fe400000000ff */
/*0110*/ IABS R3, R3 ; /* 0x0000000300037213 */
/* 0x000fe20000000000 */
/*0120*/ IMAD.IADD R0, R0, 0x1, -R9 ; /* 0x0000000100007824 */
/* 0x000fe200078e0a09 */
/*0130*/ IABS R4, R4 ; /* 0x0000000400047213 */
/* 0x000fe20000000000 */
/*0140*/ IMAD.IADD R2, R5, 0x1, -R2 ; /* 0x0000000105027824 */
/* 0x000fc600078e0a02 */
/*0150*/ IABS R5, R0 ; /* 0x0000000000057213 */
/* 0x000fe20000000000 */
/*0160*/ IMAD.MOV.U32 R0, RZ, RZ, R4 ; /* 0x000000ffff007224 */
/* 0x000fe200078e0004 */
/*0170*/ IABS R6, R2 ; /* 0x0000000200067213 */
/* 0x000fe20000000000 */
/*0180*/ IMAD.MOV.U32 R2, RZ, RZ, R3 ; /* 0x000000ffff027224 */
/* 0x000fca00078e0003 */
/*0190*/ IADD3 R0, R0, c[0x0][0x170], R2 ; /* 0x00005c0000007a10 */
/* 0x000fe20007ffe002 */
/*01a0*/ IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x160] ; /* 0x00005800ff027624 */
/* 0x000fe400078e00ff */
/*01b0*/ IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ; /* 0x00005900ff037624 */
/* 0x000fe200078e00ff */
/*01c0*/ IADD3 R0, R5, R0, R6 ; /* 0x0000000005007210 */
/* 0x000fd00007ffe006 */
/*01d0*/ STG.E.SYS [R2], R0 ; /* 0x0000000002007386 */
/* 0x000fe2000010e900 */
/*01e0*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*01f0*/ BRA 0x1f0; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
...........................