@Robert_Crovella @striker159 @Curefab

Thanks for your response. Here is a driver to launch the code, which will help you better understand how it works.

In the current code, the size is 8x8, which you can change. I need to execute it for matrices with sizes larger than 32, such as 128 or even larger, but fast execution for size 32 is also appreciated.

Unfortunately I do not have access to run Nsight Compute and I am running it on V100.

```
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
inline int ceildiv(int a, int b) {
return (a + b - 1) / b;
}
template<int N>
__device__
void scalger_device(int m, double*dA, int lda, int *info)
{
const int tx = threadIdx.x;
const int gtx = blockIdx.x * blockDim.x + tx;
double rA[N];
double reg;
__shared__ double s_y[N];
// Load A into registers
#pragma unroll
for (int i = 0; i < N; i++) {
rA[i] = dA[gtx + i * lda];
}
#pragma unroll
for (int i = 0; i < N; i++) {
if (gtx == i) {
#pragma unroll
for (int j = i; j < N; j++) {
s_y[j] = rA[j];
}
}
__syncthreads();
reg = 1/s_y[i];
if (gtx > i) {
rA[i] *= reg;
#pragma unroll
for (int j = i + 1; j < N; j++) {
rA[j] -= rA[i] * s_y[j];
}
}
__syncthreads();
}
// Store to global memory
#pragma unroll
for (int i = 0; i < N; i++) {
dA[gtx + i * lda] = rA[i];
}
}
template<int N>
__global__
void scalger_kernel_native( int m,
double *dA, int lda,
int *info)
{
scalger_device<N>(m, dA, lda, info);
}
int scalger_native(
int m, int n,
double *dA, int lda,
int *info,
cudaStream_t stream)
{
if (n == 0) return 0;
const int tbx = m;
dim3 threads(tbx, 1, 1);
dim3 grid(ceildiv(m, tbx), 1, 1);
cudaError_t err;
switch(n) {
case 8:
scalger_kernel_native<8><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 16:
scalger_kernel_native<16><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 32:
scalger_kernel_native<32><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 64:
scalger_kernel_native<64><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 128:
scalger_kernel_native<128><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 256:
scalger_kernel_native<256><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 512:
scalger_kernel_native<512><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
case 1024:
scalger_kernel_native<1024><<<grid, threads, 0, stream>>>(m, dA, lda, info);
break;
}
err = cudaGetLastError();
if (err != cudaSuccess) {
*info = err;
return -1;
}
return 0;
}
int main() {
//Size of the Matrix for test
int m = 8;
int n = 8;
int lda = m;
int info;
//Set dbg=1 to see the Matrix before and after of computation
int dbg=1;
// Create an 8x8 matrix and initialize it
std::vector<double> hA(m * n, 0.0);
if (dbg)std::cout << "Matrix before computation:" << std::endl;
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
hA[i + j * lda] = i + j+10;
if (i==j){
hA[i + j * lda]+=200;
}
if (dbg) std::cout << hA[i + j * lda] << " ";
}
if (dbg) std::cout << std::endl;
}
double *dA;
// Allocate device memory
cudaMalloc((void**)&dA, m * n * sizeof(double));
// Copy matrix to device
cudaMemcpy(dA, hA.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
// Create a CUDA stream
cudaStream_t stream;
cudaStreamCreate(&stream);
// Run the function
scalger_native(m, n, dA, lda, &info, stream);
// Synchronize and destroy the stream
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// Copy result back to host
cudaMemcpy(hA.data(), dA, m * n * sizeof(double), cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(dA);
if(dbg){
std::cout << "Matrix after computation:" << std::endl;
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
std::cout << hA[i + j * lda] << " ";
}
std::cout << std::endl;
}
}
return 0;
}
```

scalger.zip (1.4 KB)