Let me explain in more detail:
I am using cublasGemmStridedBatchedEx like this:
cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (void*)&alpha, b, CUDA_R_16F, mb, strideB, a, CUDA_R_16F, ma, strideA, (void*)&beta, c, CUDA_R_16F, mc, strideC, count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);
where
ma, na, mb, nb, mc, nc, count are int
strideA, strideB, strideC are long long
alpha, beta are float
When I wrote a simple matrix multiplication test just like this:
halfc = MatrixMulBatched (halfa, X_NOTRANS, halfb, X_NOTRANS);
My result did not have any problems, and I did get a speed boost.
But when I use cublasGemmStridedBatchedEx in my Neural Language Model, the error is:
cudaMemcpy error (cudaMemcpyDeviceToDevice)
I then used cuda-memcheck to analyze this error and got the following information:
========= Invalid __global__ read of size 16
========= at 0x00000870 in volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nn
========= by thread (75,0,0) in block (0,0,1)
========= Address 0x7f51f5971a68 is misaligned
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x24f88d]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 [0x168fd]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 [0x16987]
========= Host Frame:/home/linye/cuda-9.2/lib64/libcudart.so.9.2 (cudaLaunchKernel + 0x225) [0x448c5]
========= Host Frame:./niutrans.network.gpu (_Z90__device_stub__Z42volta_fp16_s884gemm_fp16_64x64_ldg8_f2f_nnPvS_S_mmllmmmjjjjjjS_jjS_S_ffiPvS_S_mmllmmmjjjjjjS_jjS_S_ffi + 0x265) [0x732ad5]
========= Host Frame:./niutrans.network.gpu (_Z8run_gemmIfEiR13cublasContextR14cublasStatus_t10gemmType_t17cublasOperation_tS5_iiiPKT_S8_PKv14cudaDataType_tiSA_SB_iPvSB_i12shapeTypeC_tbiibmmmPKciii + 0x5a0) [0x5b63a0]
========= Host Frame:./niutrans.network.gpu (cublasGemmStridedBatchedEx + 0x1b8a) [0x5af58a]
The implementation of my code in this part is like this:
/*
matrix multiplication of the two tensors in batch mode
optimized for GPU
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + ci * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
*/
void _MatrixMulBatchedGPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta)
{
#ifdef USE_CUDA
    CheckNTErrors((a && b && c), "Empty input tensors!");
    CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
                  "Input tensors must have a order >= 2!");
    CheckNTErrors((a->order == b->order && a->order == c->order),
                  "Input tensor and output tensor must have same order!");
    CheckNTErrors(a->devID >= 0 && b->devID >= 0 && c->devID >= 0, "The tensors must be on GPUs");

    /* per-matrix row/column counts; dimSizeRDI appears to hold the dimensions
       in reversed order (index 0 = innermost/column dim) -- confirm against XTensor */
    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
    int cn = c->dimSizeRDI[1];
    int cm = c->dimSizeRDI[0];

    CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");

    /* strides between consecutive matrices in each batch, in ELEMENTS (not bytes);
       the former *RealBlockSize byte counts were dead code and have been removed */
    int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
    int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
    int cBlockSize = c->dimSizeRDI[0] * c->dimSizeRDI[1];

    /* batch size = product of all dimensions beyond the trailing two */
    int blockNum = 1;
    for (int i = 2; i < a->order; i++) {
        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        blockNum *= a->dimSizeRDI[i];
    }

    /* switch to the device that owns the data, restore afterwards */
    int devIDBackup = 0;
    ProtectCudaDev(a->devID, devIDBackup);

    cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);

    _CudaBLASMatrixMULBatchedStrided(handle,
                                     a->data, transposedA, a->dataType, aBlockSize,
                                     b->data, transposedB, b->dataType, bBlockSize,
                                     c->data, c->dataType, cBlockSize, blockNum,
                                     a->dimSizeRDI[1], a->dimSizeRDI[0],
                                     b->dimSizeRDI[1], b->dimSizeRDI[0],
                                     c->dimSizeRDI[1], c->dimSizeRDI[0], alpha, beta);

    BacktoCudaDev(a->devID, devIDBackup);
#endif
}
/*
matrix multiplication in batch and strided mode via cuda version BLAS

for row-major matrices (as in c/c++), the trick used here is (AB)^T = B^T * A^T:
we ask cuBLAS (column-major) for C^T = B^T * A^T, and the memory layout of the
result is exactly the row-major C

NOTE(review): with CUDA_R_16F operands and tensor-op math, cuBLAS can select
vectorized fp16 kernels (e.g. the *_ldg8_* variants seen in the reported
backtrace) that issue 16-byte global loads; the a/b/c device pointers and the
batch strides then need to keep every matrix suitably aligned, otherwise
cuda-memcheck reports "Invalid __global__ read of size 16 ... misaligned".
Verify that the caller's allocator aligns fp16 buffers accordingly.

>> handle - cublas handle
>> a, b - input matrices (device pointers)
>> transposedA, transposedB - transposition flags for a and b
>> dataTypeA, dataTypeB, dataTypeC - data types of the operands
>> strideA, strideB, strideC - element strides between consecutive matrices
>> count - number of matrices in each batch
>> na, ma / nb, mb / nc, mc - row and column counts of a / b / c
>> alpha, beta - scaling coefficients
*/
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                      const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                      const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
                                      void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
                                      int count, int na, int ma, int nb, int mb, int nc, int mc,
                                      DTYPE alpha, DTYPE beta)
{
    if (dataTypeA == X_DOUBLE && dataTypeB == X_DOUBLE && dataTypeC == X_DOUBLE) {
        ShowNTErrors("TO DO!");
    }
    else if (dataTypeA == X_FLOAT && dataTypeB == X_FLOAT && dataTypeC == X_FLOAT) {
        ShowNTErrors("TO DO!");
    }
    else if (dataTypeA == X_FLOAT16 && dataTypeB == X_FLOAT16 && dataTypeC == X_FLOAT16) {
        /* computeType is CUDA_R_32F, so alpha/beta must be passed as float
           pointers; the former half-precision copies of alpha/beta were dead
           code and have been removed */

        /* the first matrix argument of the cuBLAS call is b and the second is a
           (see the (AB)^T trick above), so transposedB drives the first op and
           transposedA the second; this collapses the old four-way branch */
        cublasOperation_t opForB = (transposedB == X_TRANS) ? CUBLAS_OP_T : CUBLAS_OP_N;
        cublasOperation_t opForA = (transposedA == X_TRANS) ? CUBLAS_OP_T : CUBLAS_OP_N;

        /* inner (reduction) dimension: columns of a when a is not transposed,
           rows of a otherwise */
        int k = (transposedA == X_TRANS) ? na : ma;

        cublasSetMathMode(*handle, CUBLAS_TENSOR_OP_MATH);
        cublasStatus_t status =
            cublasGemmStridedBatchedEx(*handle, opForB, opForA, mc, nc, k,
                                       (void*)&alpha,
                                       b, CUDA_R_16F, mb, strideB,
                                       a, CUDA_R_16F, ma, strideA,
                                       (void*)&beta,
                                       c, CUDA_R_16F, mc, strideC,
                                       count, CUDA_R_32F,
                                       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
        cublasSetMathMode(*handle, CUBLAS_DEFAULT_MATH);

        /* the original code ignored the return status; surface failures here
           instead of letting them appear later as unrelated cuda errors */
        if (status != CUBLAS_STATUS_SUCCESS) {
            ShowNTErrors("cublasGemmStridedBatchedEx failed!");
        }
    }
    else {
        ShowNTErrors("Unsupported data type!");
    }
}
I am really confused: why do my test cases pass, yet using the same method inside the network triggers this error? Is there a problem with the parameters I am passing, or is there some detail I have failed to handle?