Rewriting the scaling and multiplication using the tensor core

Hi,

Is it possible to rewrite the lines below so that the operations run on the tensor cores, and would that be faster? Each iteration of the main loop performs one step of Gaussian elimination: thread `tx` holds one row of an N x N panel in registers (`rA`), the current pivot row is broadcast through shared memory (`sx`), and the threads below the pivot scale their multiplier and apply a rank-1 update.

    // Load one row of the N x N panel into registers: thread tx owns row tx.
    #pragma unroll
    for(int i = 0; i < N; i++){
        rA[i] = dA[ i * ldda + tx ];
    }

    double vab; // reciprocal of the current pivot

    #pragma unroll
    for(int i = 0; i < N; i++){
        // Thread i broadcasts its row (the current pivot row) via shared memory.
        if(tx == i){
            #pragma unroll
            for(int j = 0; j < N; j++)
                sx[j] = rA[j];
        }

        __syncthreads();

        vab = 1.0 / sx[i];
        if( tx > i ){
            // Scale the multiplier, then apply the rank-1 update to the trailing row.
            rA[i] *= vab;
            #pragma unroll
            for(int j = i+1; j < N; j++){
                rA[j] -= rA[i] * sx[j];
            }
        }

        // All threads must be done with sx before the next pivot row overwrites it.
        __syncthreads();
    }
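
For what it's worth, here is roughly what I was imagining. On sm_80 the FP64 tensor cores are exposed through `nvcuda::wmma` with a single 8x8x4 shape, so the trailing updates could in principle be grouped into small GEMMs. This is only a sketch of one such MMA, not my actual code; the kernel name, pointers, and leading dimensions are assumptions for illustration:

    #include <mma.h>
    using namespace nvcuda;

    // Sketch: one warp computes C += A*B on an 8x8 FP64 tile using the only
    // double-precision wmma shape, m8n8k4 (requires sm_80+ and CUDA 11+).
    // A is 8x4 row-major (lda = 4), B is 4x8 row-major (ldb = 8), C is 8x8.
    __global__ void dmma_8x8x4(const double *A, const double *B, double *C)
    {
        wmma::fragment<wmma::matrix_a, 8, 8, 4, double, wmma::row_major> a_frag;
        wmma::fragment<wmma::matrix_b, 8, 8, 4, double, wmma::row_major> b_frag;
        wmma::fragment<wmma::accumulator, 8, 8, double> c_frag;

        wmma::load_matrix_sync(a_frag, A, 4);
        wmma::load_matrix_sync(b_frag, B, 8);
        wmma::load_matrix_sync(c_frag, C, 8, wmma::mem_row_major);

        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);   // C += A*B

        wmma::store_matrix_sync(C, c_frag, 8, wmma::mem_row_major);
    }

What I am unsure about is whether this pays off here: each `rA[j] -= rA[i] * sx[j]` step is only a rank-1 update, so mapping it onto an 8x8x4 MMA would mean padding k from 1 up to 4, or deferring several elimination steps and applying them together as one small GEMM.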