The instructions of VABSDIFF4 increases in CUDA9.2 (VOLTA)

Hello.

I use the VABSDIFF4 instruction in CUDA/C++ by Windows Visual Studio 2015.

However, there are differences in the number of instructions in vabsdiff4 compiled with VOLTA compatible GPUs and vabsdiff4 compiled with PASCAL compatible GPUs.
For this reason, performance is deteriorating and I am in trouble.

Would you tell me how to use the same instruction as PASCAL in VOLTA?

I using following GPUs.
The dump result of SASS by “cuobjdump” is shown below.

Use GPUs:

  • VOLTA GPU: NVIDIA TITAN V
  • PASCAL GPU: NVIDIA TITAN X (Pascal)

Kernel code:

__global__ void 
cudaTest(unsigned int *result, unsigned int srcA, unsigned int srcB, unsigned int c)
{
	unsigned int d;
	asm volatile("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;":"=r"(d) : "r"(srcA), "r"(srcB), "r"(c));

	*result = d;
}

PASCAL SASS dump (cuobjdump.exe -sass kernel.compute_61.sm_61.cubin):

code for sm_61
		Function : _Z8cudaTestPjjjj
	.headerflags    @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
                                                                         /* 0x001fc400fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;                /* 0x4c98078000870001 */
        /*0010*/                   MOV R0, c[0x0][0x148] ;               /* 0x4c98078005270000 */
        /*0018*/                   MOV R5, c[0x0][0x14c] ;               /* 0x4c98078005370005 */
                                                                         /* 0x001fd000fe2007f1 */
        /*0028*/                   MOV R6, c[0x0][0x150] ;               /* 0x4c98078005470006 */
        /*0030*/                   MOV R2, c[0x0][0x140] ;               /* 0x4c98078005070002 */
        /*0038*/                   MOV R3, c[0x0][0x144] ;               /* 0x4c98078005170003 */
                                                                         /* 0x001fbc00fe2007e2 */
        /*0048*/                   VABSDIFF4.U8.U8.ACC R0, R0, R5, R6 ;  /* 0x5038033440570000 */
        /*0050*/                   STG.E [R2], R0 ;                      /* 0xeedc200000070200 */
        /*0058*/                   NOP ;                                 /* 0x50b0000000070f00 */
                                                                         /* 0x001ffc00fc4007ef */
        /*0068*/                   NOP ;                                 /* 0x50b0000000070f00 */
        /*0070*/                   NOP ;                                 /* 0x50b0000000070f00 */
        /*0078*/                   EXIT ;                                /* 0xe30000000007000f */
                                                                         /* 0x001f8000fc0007ff */
        /*0088*/                   BRA 0x80 ;                            /* 0xe2400fffff07000f */
        /*0090*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*0098*/                   NOP;                                  /* 0x50b0000000070f00 */
                                                                         /* 0x001f8000fc0007e0 */
        /*00a8*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00b0*/                   NOP;                                  /* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                                  /* 0x50b0000000070f00 */
		...........................

VOLTA SASS dump (cuobjdump.exe -sass kernel.compute_70.sm_70.cubin):

code for sm_70
		Function : _Z8cudaTestPjjjj
	.headerflags    @"EF_CUDA_SM70 EF_CUDA_PTX_SM(EF_CUDA_SM70)"
        /*0000*/              @!PT SHFL.IDX PT, RZ, RZ, RZ, RZ ;             /* 0x000000fffffff389 */
                                                                             /* 0x000fe200000e00ff */
        /*0010*/                   IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28] ;   /* 0x00000a00ff017624 */
                                                                             /* 0x000fd000078e00ff */
        /*0020*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x168] ;  /* 0x00005a00ff027624 */
                                                                             /* 0x000fe400078e00ff */
        /*0030*/                   IMAD.MOV.U32 R0, RZ, RZ, c[0x0][0x16c] ;  /* 0x00005b00ff007624 */
                                                                             /* 0x000fe400078e00ff */
        /*0040*/                   IMAD.MOV.U32 R3, RZ, RZ, 0x7654 ;         /* 0x00007654ff037424 */
                                                                             /* 0x000fc600078e00ff */
        /*0050*/                   PRMT R0, R2.reuse, 0x3210, R0 ;           /* 0x0000321002007816 */
                                                                             /* 0x040fe40000000000 */
        /*0060*/                   PRMT R2, R2, R3, c[0x0][0x16c] ;          /* 0x00005b0002027616 */
                                                                             /* 0x000fe40000000003 */
        /*0070*/                   PRMT R3, R0.reuse, 0x7650, RZ ;           /* 0x0000765000037816 */
                                                                             /* 0x040fe400000000ff */
        /*0080*/                   PRMT R4, R0, 0x7651, RZ ;                 /* 0x0000765100047816 */
                                                                             /* 0x000fe400000000ff */
        /*0090*/                   PRMT R6, R2.reuse, 0x7650, RZ ;           /* 0x0000765002067816 */
                                                                             /* 0x040fe400000000ff */
        /*00a0*/                   PRMT R7, R2, 0x7651, RZ ;                 /* 0x0000765102077816 */
                                                                             /* 0x000fc400000000ff */
        /*00b0*/                   PRMT R5, R0, 0x7652, RZ ;                 /* 0x0000765200057816 */
                                                                             /* 0x000fe400000000ff */
        /*00c0*/                   PRMT R9, R2, 0x7653, RZ ;                 /* 0x0000765302097816 */
                                                                             /* 0x000fe200000000ff */
        /*00d0*/                   IMAD.IADD R3, R3, 0x1, -R6 ;              /* 0x0000000103037824 */
                                                                             /* 0x000fe200078e0a06 */
        /*00e0*/                   PRMT R0, R0, 0x7653, RZ ;                 /* 0x0000765300007816 */
                                                                             /* 0x000fe200000000ff */
        /*00f0*/                   IMAD.IADD R4, R4, 0x1, -R7 ;              /* 0x0000000104047824 */
                                                                             /* 0x000fe200078e0a07 */
        /*0100*/                   PRMT R2, R2, 0x7652, RZ ;                 /* 0x0000765202027816 */
                                                                             /* 0x000fe400000000ff */
        /*0110*/                   IABS R3, R3 ;                             /* 0x0000000300037213 */
                                                                             /* 0x000fe20000000000 */
        /*0120*/                   IMAD.IADD R0, R0, 0x1, -R9 ;              /* 0x0000000100007824 */
                                                                             /* 0x000fe200078e0a09 */
        /*0130*/                   IABS R4, R4 ;                             /* 0x0000000400047213 */
                                                                             /* 0x000fe20000000000 */
        /*0140*/                   IMAD.IADD R2, R5, 0x1, -R2 ;              /* 0x0000000105027824 */
                                                                             /* 0x000fc600078e0a02 */
        /*0150*/                   IABS R5, R0 ;                             /* 0x0000000000057213 */
                                                                             /* 0x000fe20000000000 */
        /*0160*/                   IMAD.MOV.U32 R0, RZ, RZ, R4 ;             /* 0x000000ffff007224 */
                                                                             /* 0x000fe200078e0004 */
        /*0170*/                   IABS R6, R2 ;                             /* 0x0000000200067213 */
                                                                             /* 0x000fe20000000000 */
        /*0180*/                   IMAD.MOV.U32 R2, RZ, RZ, R3 ;             /* 0x000000ffff027224 */
                                                                             /* 0x000fca00078e0003 */
        /*0190*/                   IADD3 R0, R0, c[0x0][0x170], R2 ;         /* 0x00005c0000007a10 */
                                                                             /* 0x000fe20007ffe002 */
        /*01a0*/                   IMAD.MOV.U32 R2, RZ, RZ, c[0x0][0x160] ;  /* 0x00005800ff027624 */
                                                                             /* 0x000fe400078e00ff */
        /*01b0*/                   IMAD.MOV.U32 R3, RZ, RZ, c[0x0][0x164] ;  /* 0x00005900ff037624 */
                                                                             /* 0x000fe200078e00ff */
        /*01c0*/                   IADD3 R0, R5, R0, R6 ;                    /* 0x0000000005007210 */
                                                                             /* 0x000fd00007ffe006 */
        /*01d0*/                   STG.E.SYS [R2], R0 ;                      /* 0x0000000002007386 */
                                                                             /* 0x000fe2000010e900 */
        /*01e0*/                   EXIT ;                                    /* 0x000000000000794d */
                                                                             /* 0x000fea0003800000 */
        /*01f0*/                   BRA 0x1f0;                                /* 0xfffffff000007947 */
                                                                             /* 0x000fc0000383ffff */
		...........................

There is a high chance that the instruction was removed from silicon. In this case you are out of luck.

The simplification of the cuda multiprocessors started after the GTX 780Ti, Titan 6GB, Titan Black (GK110, full Kepler with Compute 3.5) which had a really complex feature+instruction set.

To save power, die space and improve efficiency the Maxwell architecture removed out a lot of seldom used instructions (e.g. parts of the video instruction set). Instead the chips were equipped better hardware video codecs. It is likely that nVidia made similar decisions going from Pascal to Volta.

According to the CUDA programming guide section 5.4.1. Arithmetic Instructions Table 2. Throughput of Native Arithmetic Instructions. (Number of Results per Clock Cycle per Multiprocessor)

we should expect 64 results per clock cycle per multiprocessor for vabsdiff4.

This makes me think the instruction should be supported in hardware on sm_70, unless the documentation is buggy.

But then for Compute 6.1 it says “Multiple instruct.” in that same table - which is in direct contradiction to the snippet you pasted above where we find a VABSDIFF4.U8.U8.ACC instruction.

This confuses me. Kindly requesting a deus ex machina from the gods of nVidia. ;)

I assume it is a documentation bug at this point. I have filed an internal bug at NVIDIA to have it looked at. Since this is almost certainly a documentation error, I think you should assume that the compiler behavior is correct.

I don’t know of a way to produce the vabsdiff4 SASS instruction on sm_70 architecture.

If you just need the straightforward byte-wise absolute difference without any special modes that the VABSDIFF4 instruction may offer, try the device function intrinsic __vabsdiff4u4() instead of using inline PTX. This may provide a faster emulation as the intrinsic does not need to mimic any special modes the actual PTX instruction may have (i.e. offers potentially reduced functionality in exchange for potentially higher performance).

I see your inline assembly uses a .add suffix, which indicates that you may want the __vsadu4() intrinsic instead (i.e. sum of absolute differences), but I am not 100% sure as I haven’t used these instructions in years.

This recommendation can be generalized: Whereever possible from a functionality perspective, use the SIMD device function instrinsics rather than invoking SIMD video instructions via inline PTX.

Thank you so much everyone.

I understood that there is a high possibility of a bug in documentation of vabsdiff4.
I think consider replacing the inline assembler with a SIMD device function instrinsics.

Best regards.

In addition to trying the intrinsic __vsadu4() you may also want to experiment which of the 16 possible combinations below for implementing a vsadu4 emulation is the fastest in the context of your code on your platform. Selecting from among the variants creates instruction sequences of different length and with different instruction type mixes.

I would be interested to hear which combination works best on your Volta platform, as I find it impossible to predict.

#define SETLTU_VARIANT     0   // chose 0 or 1
#define CMPLTU_VARIANT     0   // chose 0 or 1
#define VABSDIFFU_VARIANT  0   // chose 0 or 1
#define VSADU_VARIANT      0   // chose 0 or 1

#define UINT32_H4  0x80808080U

static __device__ uint32_t sign_to_bool4 (uint32_t a)
{
    return (a & UINT32_H4) >> 7;
}

static __device__ uint32_t sign_to_mask4 (uint32_t a)
{
    asm ("prmt.b32 %0,%0,0,0xba98;" : "+r"(a));
    return a;
}

static __device__ uint32_t bool_to_mask4 (uint32_t a)
{
    return (a << 8) - a;
}

static __device__ uint32_t vhaddu4 (uint32_t a, uint32_t b)
{
    /* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
       https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
       (A+B)/2 = (A AND B) + (A XOR B)/2.
    */
    return (a & b) + (((a ^ b) >> 1) & ~UINT32_H4);
}

static __device__ uint32_t ltu4_core (uint32_t a, uint32_t b)
{
    /* Sebastiano Vigna, "Broadword implementation of rank/select queries." 
       In: International Workshop on Experimental and Efficient Algorithms, 
       pp. 154-168, Springer Berlin Heidelberg, 2008.
    */
    return (((a | UINT32_H4) - (b & ~UINT32_H4)) | (a ^ b)) ^ (a | ~b);
}

static __device__ uint32_t vsetltu4 (uint32_t a, uint32_t b)
{
#if SETLTU_VARIANT
    return sign_to_bool4 (ltu4_core (a, b));
#else
    return sign_to_bool4 (vhaddu4 (~a, b));
#endif
}

static __device__ uint32_t vcmpltu4 (uint32_t a, uint32_t b)
{
#if CMPLTU_VARIANT
    return bool_to_mask4 (vsetltu4 (a, b));
#else
    return sign_to_mask4 (ltu4_core (a, b));
#endif
}

static __device__ uint32_t vabsdiffu4 (uint32_t a, uint32_t b)
{
    uint32_t t = vcmpltu4 (a, b);
#if VABSDIFFU_VARIANT
    return ((b & t) | (a & ~t)) - ((a & t) | (b & ~t));
#else
    t = (a ^ b) & t;
    return (a ^ t) - (b ^ t);
#endif
}

static __device__ uint32_t vsadu4 (uint32_t a, uint32_t b)
{
    uint32_t r, s;
    r = vabsdiffu4 (a, b);
#if VSADU_VARIANT
    s = r >> 8;
    r = (r & 0x00ff00ff) + (s & 0x00ff00ff);
    r = ((r << 16) + r) >> 16;
#else
    r = (r & 0xff) + ((r >> 8) & 0xff) + ((r >> 16) & 0xff) + ((r >> 24) & 0xff);
#endif
    return r;
}

Thanks njuffa.

I tried the vsadu4 description of 16 patterns16 patterns by my system.
As a result, the following combination was the fastest.

  • SETLTU_VARIANT 0
  • CMPLTU_VARIANT 0
  • VABSDIFFU_VARIANT 1
  • VSADU_VARIANT 1

vcmpltu4 tended to be faster using the “prmt” inline asembler.
By vabsdiff4, bit inversion is faster than exclusive-OR.
By SAD, it was faster to add two “4 byte values” at a time.

Thank you very much for the feedback. Very interesting. Is emulation via the fastest combination you found faster than CUDA’s built-in intrinsic __vsadu4() by any chance?