How to resolve an "Invalid __global__ read of size 16" error when using cublasGemmStridedBatchedEx?

Please help me solve this problem; I am really confused by it.

Let me describe the situation in detail:

I am using cublasGemmStridedBatchedEx like this:

cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N,
                           mc, nc, ma,
                           (void*)&alpha,
                           b, CUDA_R_16F, mb, strideB,
                           a, CUDA_R_16F, ma, strideA,
                           (void*)&beta,
                           c, CUDA_R_16F, mc, strideC,
                           count, CUDA_R_32F,
                           CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);

where
ma, na, mb, nb, mc, nc, count are int
strideA, strideB, strideC are long long
alpha, beta are float
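
(For clarity, my understanding of this call, which is an assumption on my part: since cuBLAS is column-major, the code relies on the (AB)^T = B^T * A^T trick mentioned in the comment further below, so the call actually computes C^T = B^T * A^T with m = mc, n = nc, k = ma, and leading dimensions mb, ma and mc for B, A and C respectively.)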

When I wrote a simple matrix multiplication test just like this:

halfc = MatrixMulBatched (halfa, X_NOTRANS, halfb, X_NOTRANS);

My result did not have any problems, and I did get a speed boost.

But when I use cublasGemmStridedBatchedEx in my neural language model, the error is:

cudaMemcpy error (cudaMemcpyDeviceToDevice)

I then used cuda-memcheck to analyze this error and got the following output:

========= Invalid __global__ read of size 16
========= at 0x00000870 in volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn
========= by thread (75,0,0) in block (0,0,1)
========= Address 0x7f51f5971a68 is misaligned
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 [0x168fd]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 [0x16987]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 (cudaLaunchKernel + 0x225) [0x448c5]
========= Host Frame:./niutrans.network.gpu (_Z90__device_stub__Z42volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nnPvS_S_mmllmmmjjjjjjS_jjS_S_ffiPvS_S_mmllmmmjjjjjjS_jjS_S_ffi + 0x265) [0x732ad5]
========= Host Frame:./niutrans.network.gpu (_Z8run_gemmIfEiR13cublasContextR14cublasStatus_t10gemmType_t17cublasOperation_tS5_iiiPKT_S8_PKv14cudaDataType_tiSA_SB_iPvSB_i12shapeTypeC_tbiibmmmPKciii + 0x5a0) [0x5b63a0]
========= Host Frame:./niutrans.network.gpu (cublasGemmStridedBatchedEx + 0x1b8a) [0x5af58a]
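
One detail I noticed in this output: the kernel name contains "ldg8" (8 halfs = 16 bytes per load), the report says a read of size 16 hit a misaligned address, and the failing thread is in block (0,0,1), which I assume indexes the second matrix of the batch. So I wrote a small check to verify that every matrix in the batch starts on a 16-byte boundary. This is only a sketch under my assumptions (the 16-byte requirement is my guess from the report, and a, b, c, strideA, strideB, strideC, count are the arguments of the call above):

#include <cstdint>
#include <cstdio>
#include <cuda_fp16.h>

/* check that every matrix in a strided FP16 batch starts on a 16-byte boundary */
static bool CheckBatchAlignment(const void * base, long long strideInElems,
                                int count, const char * name)
{
    bool ok = true;
    for (int i = 0; i < count; i++) {
        uintptr_t p = (uintptr_t)base + (uintptr_t)i * (uintptr_t)strideInElems * sizeof(__half);
        if (p % 16 != 0) {
            printf("%s: matrix %d at %p is not 16-byte aligned\n", name, i, (void*)p);
            ok = false;
        }
    }
    return ok;
}

/* usage, right before the cublasGemmStridedBatchedEx call:
CheckBatchAlignment(a, strideA, count, "a");
CheckBatchAlignment(b, strideB, count, "b");
CheckBatchAlignment(c, strideC, count, "c");
*/

If the base pointers are aligned but a stride (in half elements) is not a multiple of 8, every matrix after the first one would be misaligned, which would match the error appearing in block (0,0,1) rather than (0,0,0).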

The implementation of this part of my code is as follows:

/*
matrix multiplication of the two tensors
optimized for GPU

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA

    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have an order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input and output tensors must have the same order!");
    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");

    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];
    int aRealBlockSize = aBlockSize * a->unitSize;
    int bRealBlockSize = bBlockSize * b->unitSize;
    int cRealBlockSize = cBlockSize * c->unitSize;
    int blockNum = 1;

    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }

    int devIDBackup = 0;
    ProtectCudaDev(a->devID, devIDBackup);

    cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);

    _CudaBLASMatrixMULBatchedStrided(handle,
                                     a->data, transposedA, a->dataType, aBlockSize,
                                     b->data, transposedB, b->dataType, bBlockSize,
                                     c->data, c->dataType, cBlockSize, blockNum,
                                     a->dimSizeRDI[1], a->dimSizeRDI[0],
                                     b->dimSizeRDI[1], b->dimSizeRDI[0],
                                     c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);
    //printf("out\n");

    BacktoCudaDev(a->devID, devIDBackup);
#endif
}
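
Since the kernel launch is asynchronous, I suspect the "cudaMemcpy error (cudaMemcpyDeviceToDevice)" is only where the earlier launch failure got reported. To check where the failure really happens, I can synchronize right after the batched call; a minimal sketch (standard CUDA runtime calls, inserted just before BacktoCudaDev):

    /* force the asynchronous kernel to finish here, so that a failure is
       reported at this point instead of at a later, unrelated call */
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("batched GEMM failed: %s\n", cudaGetErrorString(err));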

/* matrix multiplication in batch and strided mode via cuBLAS */
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                      const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                      const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
                                      void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
                                      int count, int na, int ma, int nb, int mb, int nc, int mc,
                                      DTYPE alpha, DTYPE beta)
{
    /*
    matrix-matrix multiplication
    For row-major matrices (as in C/C++), the trick used here is (AB)^T = B^T * A^T
    */
    if (dataTypeA == X_DOUBLE && dataTypeB == X_DOUBLE && dataTypeC == X_DOUBLE) {
        ShowNTErrors("TO DO!");
    }
    else if (dataTypeA == X_FLOAT && dataTypeB == X_FLOAT && dataTypeC == X_FLOAT) {
        ShowNTErrors("TO DO!");
    }
    else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
        unsigned short alpha2 = FloatToFloat16(alpha);
        unsigned short beta2 = FloatToFloat16(beta);
        __half * alpha3 = (__half*)&alpha2;
        __half * beta3 = (__half*)&beta2;

        if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
            cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
            cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
            cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
        }
        else if (transposedA == X_TRANS && transposedB == X_NOTRANS) {
            cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
            cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
            cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
        }
        else if (transposedA == X_NOTRANS && transposedB == X_TRANS) {
            cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
            cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
            cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
        }
        else if (transposedA == X_TRANS && transposedB == X_TRANS) {
            cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
            cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
            cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
        }
    }
    else {
        ShowNTErrors("Unsupported data type!");
    }
}
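
Finally, to separate the cuBLAS call itself from my tensor and memory-pool code, I want to try a minimal standalone repro along these lines. This is a sketch under my assumptions (cudaMalloc returns suitably aligned pointers, and all dimensions and strides are multiples of 8 halfs; most error checking omitted):

#include <cstdio>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main()
{
    const int m = 64, n = 64, k = 64, count = 8;
    const long long strideA = (long long)m * k;
    const long long strideB = (long long)k * n;
    const long long strideC = (long long)m * n;

    __half *a, *b, *c;
    cudaMalloc(&a, strideA * count * sizeof(__half));
    cudaMalloc(&b, strideB * count * sizeof(__half));
    cudaMalloc(&c, strideC * count * sizeof(__half));

    float alpha = 1.0f, beta = 0.0f;
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

    /* C = A * B for each of the "count" matrices in the batch */
    cublasStatus_t stat = cublasGemmStridedBatchedEx(
        handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
        &alpha, a, CUDA_R_16F, m, strideA,
                b, CUDA_R_16F, k, strideB,
        &beta,  c, CUDA_R_16F, m, strideC,
        count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

    printf("status = %d, sync = %s\n", (int)stat,
           cudaGetErrorString(cudaDeviceSynchronize()));

    cublasDestroy(handle);
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}

If this runs cleanly under cuda-memcheck but the network still fails, that would point at how my buffers are allocated (the alignment of the memory pool's sub-allocations) rather than at the call itself.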

I am really confused: my test cases pass, but when I use the same method in the network I get this error. Is there a problem with the parameters I pass, or is there some detail I have not handled?