cublasGemmEx execution error code CUBLAS_STATUS_ARCH_MISMATCH

My problem is similar to https://devtalk.nvidia.com/default/topic/1023896/cublasgemmex-doesn-t-work-with-int8-utilizing-__dp4a-instruction-on-nvidia-1080ti-

env:
CUDA version: 8.0.
Device: Tesla T4, compute capability (SM) 7.5

compilation command:
nvcc -std=c++11 -arch=sm_61 gemmex_test.cu -L/usr/local/cuda-8.0/lib64/ -lcublas

I am getting below error:
CUBLAS_STATUS_ARCH_MISMATCH

What’s wrong with my cublasGemmEx use on Tesla T4?

#include <iostream>

#include <cublas_v2.h>
#include <thrust/device_vector.h>

/// Map a cublasStatus_t to its symbolic name for error reporting.
/// Returns a static string; never returns null.
const char* cublasGetErrorString(cublasStatus_t status) {
  switch(status) {
    case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
    // These two exist in the cuBLAS versions in question (CUDA 8+) but were
    // missing from the original switch, so they printed "unknown error".
    case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
    case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
  }
  return "unknown error";
}

int main(void) {
  // matrix A
  int rowA = 40;
  int colA = 40;
  // matrix B
  int rowB = colA;
  int colB = 40;
  // matrix C
  int rowC = rowA;
  int colC = colB;

  thrust::device_vector<unsigned char> A(rowA * colA);
  thrust::device_vector<unsigned char> B(rowB * colB);
  thrust::device_vector<unsigned int> C(rowC * colC);

  for (size_t i = 0; i < rowA; i++){
    for (size_t j = 0; j < colA; j++){
      A[i * rowA + j] = i + j;
    }
  }

  for (size_t i = 0; i < rowB; i++){
    for (size_t j = 0; j < colB; j++){
      B[i * rowA + j] = i + j;
    }
  }

  for (size_t i = 0; i < rowC; i++) {
    for (size_t j = 0; j < colC; j++) {
      C[i * rowA + j] = i + j;
      if (i == 0) {
        std::cout << " " << C[i * rowA + j];
      }
    }
  }
  std::cout << std::endl;

  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;;
  }

  int alpha = 1;
  int beta = 0;
  // A * B + C
  status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
      rowA, colB, colA,
      &alpha, thrust::raw_pointer_cast(&A[0]), CUDA_R_8I, rowA,
      thrust::raw_pointer_cast(&B[0]), CUDA_R_8I, colB,
      &beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I, colB, CUDA_R_32I, CUBLAS_GEMM_ALGO0);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
  }

  std::cout << "output print: " << std::endl;
  for (size_t i = 0; i < rowC; i++) {
    for (size_t j = 0; j < colC; j++) {
      C[i * rowA + j] = i + j;
      if (i == 0) {
        std::cout << " " << C[i * rowA + j];
      }
    }
  }
  std::cout << std::endl;

  status = cublasDestroy(handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
  }

  return 0;
}

Update: I upgraded CUDA to 10.0 and that solved the problem — CUDA 8.0 predates the Turing architecture, so it cannot generate or JIT code for the Tesla T4 (sm_75), which is exactly what CUBLAS_STATUS_ARCH_MISMATCH reports. See https://docs.nvidia.com/deeplearning/sdk/cudnn-support-matrix/index.html