Hi,
Is it possible to rewrite the lines below so that the operations run on tensor cores and execute faster?
// Each thread copies N values from dA into its own rA array: entry `col`
// comes from dA[col * ldda + tx], i.e. a stride of ldda between entries.
// (presumably row `tx` of a column-major matrix with leading dimension
// ldda — confirm against the kernel's caller)
#pragma unroll
for (int col = 0; col < N; ++col) {
    rA[col] = dA[col * ldda + tx];
}
// Elimination sweep over the N columns held in rA.
// For every step i:
//   1. thread i copies its rA values into sx so all threads can read them;
//   2. every thread computes the reciprocal of sx[i] (the step's pivot);
//   3. threads with tx > i scale rA[i] by that reciprocal and subtract
//      rA[i] * sx[j] from their remaining entries j = i+1 .. N-1.
// No pivot search is done here, so sx[i] == 0 yields a division by zero.
// (sx is presumably a __shared__ buffer and tx the thread index — confirm.)
double vab;  // reciprocal of the current pivot sx[i]
#pragma unroll
for (int i = 0; i < N; i++) {
    // Thread i publishes its current values for the other threads.
    if (tx == i) {
        #pragma unroll
        for (int j = 0; j < N; j++)
            sx[j] = rA[j];
    }
    // Barrier: sx must be completely written before any thread reads it.
    __syncthreads();
    // Fix: the original statement lacked its terminating semicolon.
    vab = 1 / sx[i];
    if (tx > i) {
        // Store the multiplier in place of the eliminated entry ...
        rA[i] *= vab;
        // ... and update the trailing entries with it.
        #pragma unroll
        for (int j = i + 1; j < N; j++) {
            rA[j] -= rA[i] * sx[j];
        }
    }
    // NOTE(review): the rest of this loop body is not visible in this
    // excerpt. Thread i+1 overwrites sx at the start of the next iteration,
    // so a second __syncthreads() is needed before the loop closes unless
    // one already follows — confirm.