My problem is similar to https://devtalk.nvidia.com/default/topic/1023896/cublasgemmex-doesn-t-work-with-int8-utilizing-__dp4a-instruction-on-nvidia-1080ti-
Environment:
CUDA version: 8.0
Device: Tesla T4, compute capability 7.5 (SM 7.5)
compilation command:
nvcc -std=c++11 -arch=sm_61 gemmex_test.cu -L/usr/local/cuda-8.0/lib64/ -lcublas
I am getting the error below:
CUBLAS_STATUS_ARCH_MISMATCH
What is wrong with my use of cublasGemmEx on the Tesla T4?
#include <iostream>
#include <vector>

#include <cublas_v2.h>
#include <thrust/device_vector.h>
/// Maps a cuBLAS status code to the name of its enumerator.
///
/// @param status  Status value returned by a cuBLAS call.
/// @return A string literal naming the enumerator, or "unknown error" for any
///         value that is not listed in the switch below.
const char* cublasGetErrorString(cublasStatus_t status) {
    switch (status) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        // These two were previously reported as "unknown error". NOT_SUPPORTED
        // in particular is a status cublasGemmEx can return for an unsupported
        // type/algorithm combination, so it must be distinguishable.
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    // Reached only for values outside the enumerators handled above.
    return "unknown error";
}
int main(void) {
// matrix A
int rowA = 40;
int colA = 40;
// matrix B
int rowB = colA;
int colB = 40;
// matrix C
int rowC = rowA;
int colC = colB;
thrust::device_vector<unsigned char> A(rowA * colA);
thrust::device_vector<unsigned char> B(rowB * colB);
thrust::device_vector<unsigned int> C(rowC * colC);
for (size_t i = 0; i < rowA; i++){
for (size_t j = 0; j < colA; j++){
A[i * rowA + j] = i + j;
}
}
for (size_t i = 0; i < rowB; i++){
for (size_t j = 0; j < colB; j++){
B[i * rowA + j] = i + j;
}
}
for (size_t i = 0; i < rowC; i++) {
for (size_t j = 0; j < colC; j++) {
C[i * rowA + j] = i + j;
if (i == 0) {
std::cout << " " << C[i * rowA + j];
}
}
}
std::cout << std::endl;
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;;
}
int alpha = 1;
int beta = 0;
// A * B + C
status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
rowA, colB, colA,
&alpha, thrust::raw_pointer_cast(&A[0]), CUDA_R_8I, rowA,
thrust::raw_pointer_cast(&B[0]), CUDA_R_8I, colB,
&beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I, colB, CUDA_R_32I, CUBLAS_GEMM_ALGO0);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
}
std::cout << "output print: " << std::endl;
for (size_t i = 0; i < rowC; i++) {
for (size_t j = 0; j < colC; j++) {
C[i * rowA + j] = i + j;
if (i == 0) {
std::cout << " " << C[i * rowA + j];
}
}
}
std::cout << std::endl;
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
}
return 0;
}