CUDA 9.1 AlphaComp shows strange behavior

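Using nppiAlphaComp_8u_AC4R from CUDA 9.1 on Windows x64, I composited two 100% white images with 100% opaque alpha (i.e. every pixel is #FFFFFFFF), and the result was a mixture of #FFFFFF and #FEFEFE pixels. This looks like strange behavior: I expected every pixel to stay #FFFFFF. The script below reproduces it. Thanks.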

import numpy as np
import cv2
import ctypes as ct
import numpy.ctypeslib as npct
from enum import IntEnum

# Directory that holds the CUDA 9.1 DLLs. A raw string literal cannot end with
# a single backslash, hence the doubled trailing separator.
DLL_PATH = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.1\bin\\"

# Test image geometry in pixels.
WIDTH = 512
HEIGHT = 512

# Channels per pixel; buffer sizes and line steps below multiply by this.
N_CH = 4

def _loadl_dll(name):
    """Load the DLL called *name* from ``DLL_PATH`` through ``numpy.ctypeslib``."""
    full_path = DLL_PATH + name
    # The concatenated path is the library name; "." is the loader path.
    return npct.load_library(full_path, ".")

# CUDA runtime DLL; provides the cudaMemcpy and cudaMemset entry points wrapped below.
_dll_rt = _loadl_dll("cudart64_91.dll")

class _cudaMemcpyKind(IntEnum):
    """Copy-direction code passed as the last argument of ``cudaMemcpy``.

    Only the one direction this script uses is declared.
    """

    cudaMemcpyDeviceToHost = 2

# cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
# `count` is a size_t (64 bits wide on x64), so it is declared as c_size_t
# rather than a 32-bit int to match the native prototype.
_dll_rt.cudaMemcpy.argtypes = [ct.POINTER(ct.c_uint8), ct.POINTER(ct.c_uint8), ct.c_size_t, ct.c_int32]
_dll_rt.cudaMemcpy.restype = ct.c_int32
def memcpy_d2h(pt_src, dst):
    """Copy ``dst.nbytes`` bytes from device pointer *pt_src* into array *dst*.

    Args:
        pt_src: ``POINTER(c_uint8)`` to the device buffer to read from.
        dst: NumPy array that receives the bytes; its ``nbytes`` is the copy
            size. NOTE(review): assumes *dst* is C-contiguous - confirm at
            call sites.

    Returns:
        The ``cudaError_t`` status code from ``cudaMemcpy`` (0 is success).
    """
    return _dll_rt.cudaMemcpy(dst.ctypes.data_as(ct.POINTER(ct.c_uint8)), pt_src, dst.nbytes,
                             ct.c_int32(_cudaMemcpyKind.cudaMemcpyDeviceToHost))

# cudaError_t cudaMemset(void* devPtr, int value, size_t count)
# `count` is a size_t (64 bits wide on x64), so it is declared as c_size_t
# rather than a 32-bit int to match the native prototype.
_dll_rt.cudaMemset.argtypes = [ct.POINTER(ct.c_uint8), ct.c_int32, ct.c_size_t]
_dll_rt.cudaMemset.restype = ct.c_int32
def memset(pt_dst, val, count):
    """Fill *count* bytes starting at device pointer *pt_dst* with *val*.

    Args:
        pt_dst: ``POINTER(c_uint8)`` to the device buffer to fill.
        val: Byte value to write, passed as a C int.
        count: Number of bytes to set.

    Returns:
        The ``cudaError_t`` status code from ``cudaMemset`` (0 is success).
    """
    return _dll_rt.cudaMemset(pt_dst, val, count)


# NPP DLL that provides the nppsMalloc_8u / nppsFree entry points wrapped below.
_dll_sf = _loadl_dll("npps64_91.dll")

# Prototype as declared here: takes a 32-bit int size, returns a uint8 pointer.
_dll_sf.nppsMalloc_8u.restype = ct.POINTER(ct.c_uint8)
_dll_sf.nppsMalloc_8u.argtypes = [ct.c_int32]
def malloc_pt(size):
    """Allocate *size* bytes with ``nppsMalloc_8u``.

    Returns the resulting ``POINTER(c_uint8)`` unchanged; no NULL check is
    performed here.
    """
    pt = _dll_sf.nppsMalloc_8u(size)
    return pt

# void nppsFree(void* pValues)
# The function returns nothing, so restype must be None; with the ctypes
# default (c_int) the wrapper would return an arbitrary register value.
_dll_sf.nppsFree.argtypes = [ct.c_void_p]
_dll_sf.nppsFree.restype = None
def free_pt(pt):
    """Release a buffer previously returned by ``nppsMalloc_8u``.

    Args:
        pt: ctypes pointer to the buffer to free.

    Returns:
        ``None`` (the native function has no return value).
    """
    return _dll_sf.nppsFree(pt)


# NPP DLL that provides the nppiAlphaComp_8u_AC4R entry point wrapped below.
_dll_al = _loadl_dll("nppial64_91.dll")

class NppiSize(ct.Structure):
    """ctypes structure holding a width/height pair of C ints.

    Passed by value to ``nppiAlphaComp_8u_AC4R`` as the region size.
    """

    _fields_ = [
        ("width", ct.c_int),
        ("height", ct.c_int),
    ]

class _NppiAlphaOp(IntEnum):
    """Alpha-composition operation code passed to ``nppiAlphaComp_8u_AC4R``.

    Only the one operation this script uses is declared.
    """

    NPPI_OP_ALPHA_OVER = 0

# Argument layout: (pointer, line step) for each of the two sources and the
# destination, then the region size by value, then the operation code.
_dll_al.nppiAlphaComp_8u_AC4R.argtypes = [
    ct.POINTER(ct.c_uint8), ct.c_int32,
    ct.POINTER(ct.c_uint8), ct.c_int32,
    ct.POINTER(ct.c_uint8), ct.c_int32,
    NppiSize,
    ct.c_int32,
]
def alpha_comp(pt_src_upper, pt_src_lower, pt_dst, step, width, height):
    """Run ``nppiAlphaComp_8u_AC4R`` with ``NPPI_OP_ALPHA_OVER``.

    *pt_src_upper* and *pt_src_lower* are the two source pointers and *pt_dst*
    the destination pointer; the same *step* is used for all three buffers and
    the region is *width* x *height*. Returns whatever the NPP call returns.
    """
    roi = NppiSize(width, height)
    op = _NppiAlphaOp.NPPI_OP_ALPHA_OVER
    return _dll_al.nppiAlphaComp_8u_AC4R(
        pt_src_upper, step, pt_src_lower, step, pt_dst, step, roi, op
    )


def alloc_white():
    """Allocate a WIDTH * HEIGHT * N_CH byte buffer with every byte set to 255.

    Returns:
        The ``POINTER(c_uint8)`` returned by ``malloc_pt``; release it with
        ``free_pt``.

    Raises:
        MemoryError: If the allocation returns a NULL pointer.
        RuntimeError: If ``memset`` reports a non-zero status; the buffer is
            freed before raising.
    """
    size = WIDTH * HEIGHT * N_CH
    pt = malloc_pt(size)
    # A NULL ctypes pointer is falsy; writing through it would be invalid.
    if not pt:
        raise MemoryError("nppsMalloc_8u failed to allocate {} bytes".format(size))
    status = memset(pt, 255, size)
    if status != 0:
        free_pt(pt)
        raise RuntimeError("cudaMemset failed with status {}".format(status))
    return pt

def show(caption, pt):
    """Download the image at device pointer *pt* and display it until a key is pressed.

    Args:
        caption: Window title passed to ``cv2.imshow``.
        pt: Pointer to a WIDTH x HEIGHT image with N_CH bytes per pixel.

    Raises:
        RuntimeError: If the device-to-host copy reports a non-zero status.
    """
    # The host buffer must cover all N_CH channels; a (WIDTH, HEIGHT) buffer
    # would receive only the first quarter of the image bytes.
    cv2buf = np.zeros(shape=[HEIGHT, WIDTH, N_CH], dtype=np.uint8)
    status = memcpy_d2h(pt, cv2buf)
    if status != 0:
        raise RuntimeError("cudaMemcpy failed with status {}".format(status))
    cv2.imshow(caption, cv2buf)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

def save(filename, pt):
    """Download the image at device pointer *pt* and write it to *filename*.

    Args:
        filename: Output path passed to ``cv2.imwrite``.
        pt: Pointer to a WIDTH x HEIGHT image with N_CH bytes per pixel.

    Raises:
        RuntimeError: If the device-to-host copy reports a non-zero status.
    """
    # The host buffer must cover all N_CH channels; a (WIDTH, HEIGHT) buffer
    # would receive only the first quarter of the image bytes.
    cv2buf = np.zeros(shape=[HEIGHT, WIDTH, N_CH], dtype=np.uint8)
    status = memcpy_d2h(pt, cv2buf)
    if status != 0:
        raise RuntimeError("cudaMemcpy failed with status {}".format(status))
    cv2.imwrite(filename, cv2buf)

# Repro: composite two all-0xFF images and show/save the source and the result.
pt_white1 = pt_white2 = pt_dst = None
try:
    pt_white1 = alloc_white()
    pt_white2 = alloc_white()

    show("source", pt_white1)
    save("source.png", pt_white1)

    pt_dst = malloc_pt(WIDTH * HEIGHT * N_CH)

    status = alpha_comp(pt_white1, pt_white2, pt_dst, WIDTH * N_CH, WIDTH, HEIGHT)
    # NppStatus: 0 is success, negative values are errors, positive values are
    # warnings. Inspecting the output of a failed call would be misleading.
    if status < 0:
        raise RuntimeError("nppiAlphaComp_8u_AC4R failed with NppStatus {}".format(status))

    show("result", pt_dst)
    save("result.png", pt_dst)
finally:
    # Release every buffer that was actually allocated, even if a step failed.
    for pt in (pt_white1, pt_white2, pt_dst):
        if pt is not None:
            free_pt(pt)