Highly tuned implementation of __vaddus4()

__vaddus4() is one of the CUDA SIMD device intrinsics. For four bytes packed into a 32-bit unsigned integer, it performs byte-wise unsigned addition with saturation, i.e. each byte-wise sum is clamped to 255 (0xFF) when it exceeds the representable range of a byte.
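
For example (byte lanes written most significant first):

  __vaddus4 (0xFF014080, 0x02014080) == 0xFF0280FF
  // 0xFF + 0x02 and 0x80 + 0x80 exceed 0xFF and clamp to 0xFF;
  // 0x01 + 0x01 = 0x02 and 0x40 + 0x40 = 0x80 pass through unchanged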

While the operation was supported natively in GPU hardware for sm_3x, that support was discontinued, so on newer architectures the intrinsic is implemented via fairly efficient emulation code. On sm_7x and sm_8x this emulation sequence comprises eight instructions and makes generous use of the LOP3 instruction, first introduced with the Maxwell architecture. However, the emulation relies entirely on the compiler to group logical operations into LOP3s.
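
For background, LOP3 evaluates an arbitrary logical function of three inputs, selected by an 8-bit lookup table (immLut). Per the PTX ISA, the table value for a function f is simply f applied to the constants 0xF0, 0xCC, and 0xAA, so the immLut values used in the code below follow directly:

  (0xF0 ^ 0xCC) | 0xAA                   == 0xBE   // (a ^ b) | c
  (0xF0 ^ 0xCC) & 0xAA                   == 0x28   // (a ^ b) & c
  (0xAA | (0xF0 & 0xCC)) & (0xF0 | 0xCC) == 0xE8   // majority-of-3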

Since I needed the fastest possible implementation of __vaddus4(), I figured that code explicitly designed around LOP3 from the start might lead to an even more efficient implementation. After several hours of work, I found an algorithmic approach that results in a seven-instruction sequence on sm_7x and sm_8x platforms, i.e. one instruction fewer than CUDA's built-in intrinsic.

While CUDA SIMD intrinsics like __vaddus4() are probably not often used, I figured there might be some readers to whom this is of interest or use, so I am sharing my code here.

/*
  Copyright (c) 2020, Norbert Juffa
  All rights reserved.

  Redistribution and use in source and binary forms, with or without 
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright 
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#if (__CUDACC__)
#define __HOST__ __host__
#define __DEVICE__ __device__
#else // __CUDACC__
#define __HOST__
#define __DEVICE__
#endif // __CUDACC__

#define UINT32_H4  0x80808080U   // byte-wise sign bits (MSBs)
#define UINT32_L4  0x01010101U   // byte-wise LSBs 

/* extend sign bits into mask, byte-wise */
__HOST__ __DEVICE__ uint32_t sign_to_mask4 (uint32_t a)
{
#if (__CUDA_ARCH__ >= 200)
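    // prmt selector nibbles 0x8-0xb replicate the sign bit (msb) of source
    // bytes 0-3 across the corresponding result byte (see PTX ISA: prmt)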
    asm ("prmt.b32 %0,%0,0,0xba98;" : "+r"(a)); // convert MSBs to masks
#else
    a = a & UINT32_H4;    // isolate sign bits
    a = a + a - (a >> 7); // extend them to full byte to create mask
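    // per byte: 2*0x80 - 0x01 = 0xff; the borrow from the subtraction
    // cancels the carry into the next byte lane, so lanes stay independent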
#endif
    return a;
}

// r = (a ^ b) | c
__HOST__ __DEVICE__ uint32_t lop3_be (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xbe;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a ^ b) | c;
#endif // __CUDA_ARCH__
    return r;
}

// r = (a ^ b) & c
__HOST__ __DEVICE__ uint32_t lop3_28 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0x28;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (a ^ b) & c;
#endif // __CUDA_ARCH__
    return r;
}

// r = (c | (a & b)) & (a | b); the median-of-3 / majority-of-3 operation
__HOST__ __DEVICE__ uint32_t lop3_e8 (uint32_t a, uint32_t b, uint32_t c)
{
    uint32_t r;
#if (__CUDA_ARCH__ >= 500)
    asm ("lop3.b32 %0,%1,%2,%3,0xe8;\n\t" : "=r"(r) : "r"(a), "r"(b), "r"(c));
#else // __CUDA_ARCH__
    r = (c | (a & b)) & (a | b);
#endif // __CUDA_ARCH__
    return r;
}

/*
  Byte-wise unsigned addition with saturation.

  Mask off the msb (bit 7) of each byte and add, giving the seven low-order
  result bits per byte plus the carry-in to the msb in bit 7. Compute the
  sum bit and the carry-out of the msb separately. Saturate a byte to 0xFF
  when there is carry-out from its msb.

  res<7> = x<7> ^ y<7> ^ cy-in, with masking needed to isolate the msb;
  since carry-out forces the byte to 0xFF anyway, (x | y | cy-in) works
  equally well.
  cy-out<7> = ((x | y) & cy-in) | (x & y) = (cy-in | (x & y)) & (x | y)
*/
__HOST__ __DEVICE__ uint32_t my_vaddus4 (uint32_t x, uint32_t y)
{
#if (__CUDA_ARCH__ >= 500)
    unsigned int m, r, s, t;
    m = UINT32_H4;       // per-byte mask for MSB
    r = x & ~m;          // clear msbs
    t = y & ~m;          // clear msbs
    s = lop3_28 (x,y,m); // msb sum bits
    r = r + t;           // add without msbs, record carry-out in msbs
    x = lop3_e8 (x,y,r); // msb c-out = (c-in | (x & y)) & (x | y)
    x = sign_to_mask4(x);// create saturation value
    r = lop3_be (r,s,x); // compute final msb and merge in saturation value
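    // expected codegen: 2x AND, 1x IADD, 3x LOP3, 1x PRMT = 7 instructions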
#elif (__CUDA_ARCH__ >= 300)
    uint32_t r, c = 0;
    asm ("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" : "=r"(r):"r"(x),"r"(y),"r"(c));  
#elif (__CUDA_ARCH__ >= 200)
    uint32_t a, b, m, r, s, t;
    m = UINT32_H4;       // per-byte mask for MSB
    a = x & ~m;          // clear msb to avoid cy-out from bit 7 during addition
    b = y & ~m;          // clear msb to avoid cy-out from bit 7 during addition
    r = a + b;           // sum of lsbs, r<7> = msb cy-in
    s = (x | y) & m;     // msb sum if no unsigned overflow, else msb saturation
    t = x & y;           // msb cy-out (if no cy-in)
    t = (r | t) & s;     // msb cy-out = (x | y) & (msb cy-in | (x & y))
    t = sign_to_mask4(t);// create saturation value
    r = r | s | t;       // combine msb sum and lsb sum; merge saturation value
#else // __CUDA_ARCH__
    uint32_t a, b, m, r, s, t;
    m = UINT32_H4;       // per-byte mask for MSB
    a = x & ~m;          // clear msb to avoid cy-out from bit 7 during addition
    b = y & ~m;          // clear msb to avoid cy-out from bit 7 during addition
    r = a + b;           // sum of lsbs, r<7> = msb cy-in
    s = (x | y) & m;     // msb sum if no unsigned overflow, else msb saturation
    t = x & y;           // msb cy-out (if no cy-in)
    t = (r | t) & s;     // msb cy-out = (x | y) & (msb cy-in | (x & y))
    t = t - (t >> 7);    // lsb saturation: msb cy-out ? 0x7f : 0x00 
    r = r | s | t;       // combine msb sum and lsb sum; saturate lsbs if needed
#endif // __CUDA_ARCH__ 
    return r;
}

/* reference function for byte-wise addition with unsigned saturation */
uint32_t vaddus4_ref (uint32_t a, uint32_t b)
{
    uint8_t a0 = (uint8_t)(a >>  0);
    uint8_t a1 = (uint8_t)(a >>  8);
    uint8_t a2 = (uint8_t)(a >> 16);
    uint8_t a3 = (uint8_t)(a >> 24);
    uint8_t b0 = (uint8_t)(b >>  0);
    uint8_t b1 = (uint8_t)(b >>  8);
    uint8_t b2 = (uint8_t)(b >> 16);
    uint8_t b3 = (uint8_t)(b >> 24);
    b0 = ((a0 + b0) > 255) ? 255 : (a0 + b0);
    b1 = ((a1 + b1) > 255) ? 255 : (a1 + b1);
    b2 = ((a2 + b2) > 255) ? 255 : (a2 + b2);
    b3 = ((a3 + b3) > 255) ? 255 : (a3 + b3);
    return (((uint32_t)b0 <<  0) | ((uint32_t)b1 <<  8) | 
            ((uint32_t)b2 << 16) | ((uint32_t)b3 << 24));
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC  ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

#define THREADS_PER_BLK   (256)
#define USE_CUDA_BUILTIN  (0)
#define BUF_SIZE          (1 << 24)
#define N                 (16)

#if __CUDACC__
__global__ void kernel (const uint32_t * __restrict__ a, 
                        const uint32_t * __restrict__ b,
                        uint32_t * __restrict__ r, int len)
{
    int stride = gridDim.x * blockDim.x;
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = tid; i < len; i += stride) {
#if USE_CUDA_BUILTIN
        r[i] = __vaddus4 (a[i], b[i]);
#else // USE_CUDA_BUILTIN
        r[i] = my_vaddus4 (a[i], b[i]);
#endif // USE_CUDA_BUILTIN
    }
}
#else // __CUDACC__
void kernel (const uint32_t *a, const uint32_t *b, uint32_t *r, int len)
{
    for (int i = 0; i < len; i++) {
        r[i] = my_vaddus4 (a[i], b[i]);
    }
}
#endif // __CUDACC__

int main (void)
{
    uint32_t *a = 0, *b = 0, *r = 0;
    a = (uint32_t *)malloc (sizeof (*a) * BUF_SIZE);
    b = (uint32_t *)malloc (sizeof (*b) * BUF_SIZE);
    r = (uint32_t *)malloc (sizeof (*r) * BUF_SIZE);

#if __CUDACC__    
    uint32_t *a_d = 0, *b_d = 0, *r_d = 0;
    cudaMalloc ((void **)&a_d, sizeof (*a_d) * BUF_SIZE);
    cudaMalloc ((void **)&b_d, sizeof (*b_d) * BUF_SIZE);
    cudaMalloc ((void **)&r_d, sizeof (*r_d) * BUF_SIZE);
#endif // __CUDACC__

    for (int count = 0; count < N; count++) {
        for (int i = 0; i < BUF_SIZE; i++) {
            a[i] = KISS;
            b[i] = KISS;
        }

#if __CUDACC__
        cudaMemcpy (a_d, a, sizeof (*a_d) * BUF_SIZE, cudaMemcpyHostToDevice);
        cudaMemcpy (b_d, b, sizeof (*b_d) * BUF_SIZE, cudaMemcpyHostToDevice);
        dim3 dimBlock(THREADS_PER_BLK);
        dim3 dimGrid((BUF_SIZE + (dimBlock.x - 1)) / dimBlock.x);
        kernel<<<dimGrid,dimBlock>>>(a_d, b_d, r_d, BUF_SIZE);
        cudaMemcpy (r, r_d, sizeof (*r) * BUF_SIZE, cudaMemcpyDeviceToHost);
#else // __CUDACC__        
        kernel (a, b, r, BUF_SIZE);
#endif // __CUDACC__ 

        for (int i = 0; i < BUF_SIZE; i++) {
            uint32_t ref = vaddus4_ref (a[i], b[i]);
            if (ref != r[i]) {
                printf ("!!!!error: a=%08x  b=%08x  r=%08x  ref=%08x\n",
                        a[i], b[i], r[i], ref);
                return EXIT_FAILURE;
            }
        }
    }
    printf ("vaddus4() test passed\n");

    free (r);
    free (b);
    free (a);

#if __CUDACC__
    cudaFree (r_d);
    cudaFree (b_d);
    cudaFree (a_d);
#endif // __CUDACC__
    return EXIT_SUCCESS;
}