cuDNN won't use Tensor Cores for INT8 convolution forward

I am trying to use Tensor Cores to compute an INT8 2D convolution forward pass, but cuDNN keeps choosing non-Tensor-Core kernels. Can anyone help? Thanks!

My GPU is an RTX 3090.

I already set the convolution math type to CUDNN_TENSOR_OP_MATH, and cudnnGetConvolutionForwardAlgorithm_v7 reports available algorithms with mathType = CUDNN_TENSOR_OP_MATH. But when I call cudnnConvolutionForward, cuDNN always invokes a CUDA-core kernel for the same algorithm, and performance is very low.

Only when I set the tensor format to CUDNN_TENSOR_NCHW_VECT_C and the data type to CUDNN_DATA_INT8x32 — a combination supported only by Tensor Core kernels — does cuDNN actually use Tensor Core kernels.

here is my code:

cudnnTensorDescriptor_t ifmap_desc;
checkCudnn(cudnnCreateTensorDescriptor(&ifmap_desc));
checkCudnn(cudnnSetTensor4dDescriptor(ifmap_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, N, IC, IH, IW));

cudnnTensorDescriptor_t ofmap_desc;
checkCudnn(cudnnCreateTensorDescriptor(&ofmap_desc));
checkCudnn(cudnnSetTensor4dDescriptor(ofmap_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, N, OC, OH, OW));

cudnnFilterDescriptor_t kernel_desc;
checkCudnn(cudnnCreateFilterDescriptor(&kernel_desc));
checkCudnn(cudnnSetFilter4dDescriptor(kernel_desc, CUDNN_DATA_INT8, CUDNN_TENSOR_NHWC, OC, IC, KH, KW));

cudnnConvolutionDescriptor_t conv_desc;
checkCudnn(cudnnCreateConvolutionDescriptor(&conv_desc));
checkCudnn(cudnnSetConvolution2dDescriptor(conv_desc, PH, PW, SH, SW, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_INT32));
checkCudnn(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));

int algo_total;
checkCudnn(cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &algo_total));

printf("INFO: cudnn iconv algo total = %d\n", algo_total);

int algo_found;
cudnnConvolutionFwdAlgoPerf_t *algo_perf = (cudnnConvolutionFwdAlgoPerf_t *)malloc(algo_total * sizeof(cudnnConvolutionFwdAlgoPerf_t));
checkCudnn(cudnnGetConvolutionForwardAlgorithm_v7(handle, ifmap_desc, kernel_desc, conv_desc, ofmap_desc, algo_total, &algo_found, algo_perf));

if (!algo_found)
    printf("ERROR: cudnn iconv algo not found\n");
else {
    printf("INFO: cudnn iconv found %d algo\n", algo_found);
    for (int i = 0; i < algo_found; i++)
        printf("Algo = %d, Time = %f ms, Memory = %ld Bytes, Determinism = %d, MathType = %d\n",
            algo_perf[i].algo, algo_perf[i].time, algo_perf[i].memory, algo_perf[i].determinism, algo_perf[i].mathType);
}

cudnnConvolutionFwdAlgo_t algo = algo_perf[0].algo;

size_t workspace_size;
checkCudnn(cudnnGetConvolutionForwardWorkspaceSize(handle, ifmap_desc, kernel_desc, conv_desc, ofmap_desc, algo, &workspace_size));
printf("INFO: cudnn conv workspace size is %ld\n", workspace_size);

void *workspace = NULL;
checkCuda(cudaMalloc(&workspace, workspace_size));

float alpha = 1.0f, beta = 0.0f;
checkCudnn(cudnnConvolutionForward(handle,
    &alpha, ifmap_desc, ifmap, kernel_desc, kernel, conv_desc, algo, workspace, workspace_size, &beta, ofmap_desc, ofmap));

Output:

INFO: cudnn iconv algo total = 10
INFO: cudnn iconv found 10 algo
Algo = 1, Time = -1.000000 ms, Memory = 3934336 Bytes, Determinism = 1, MathType = 1
Algo = 1, Time = -1.000000 ms, Memory = 3934336 Bytes, Determinism = 1, MathType = 0
Algo = 0, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 2, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 5, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 4, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 7, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 6, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 3, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 0
Algo = 7, Time = -1.000000 ms, Memory = 0 Bytes, Determinism = 1, MathType = 1

So Tensor Core kernels (mathType = 1) are supported! But cudnnConvolutionForward only lets me specify the algorithm — not the math type of the kernel variant — and cuDNN automatically falls back to the CUDA-core variant. How can I make cuDNN use the Tensor Core kernels?