I don’t see it. Here is my test case, CUDA 12.0:
$ cat t1.cu
#include <cstdio>
#include <cublas_v2.h>
// Minimal cuBLAS SGEMM reproducer: issues one N x N x N single-precision GEMM
// (no transposes, alpha = 1, beta = 0) on device buffers and waits for it to
// finish, so the launched kernel can be inspected with a profiler.
//
// The input buffers are intentionally left uninitialized; this program does
// not read back or validate the result.
//
// Returns 0 on success, 1 if any CUDA or cuBLAS call reports an error.
int main(){
  const int N = 1024;
  // Widen before multiplying so the element count cannot overflow int.
  const size_t n2 = static_cast<size_t>(N) * N;
  const float alpha = 1.0f;
  const float beta = 0.0f;
  float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
  int rc = 1;

  cublasHandle_t handle;
  cublasStatus_t st = cublasCreate(&handle);
  if (st != CUBLAS_STATUS_SUCCESS) {
    std::fprintf(stderr, "cublasCreate failed: status %d\n", static_cast<int>(st));
    return 1;
  }

  // Allocate the three matrices; stop at the first failure.
  cudaError_t err = cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0]));
  if (err == cudaSuccess)
    err = cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0]));
  if (err == cudaSuccess)
    err = cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0]));

  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
  } else {
    st = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                     &alpha, d_A, N, d_B, N, &beta, d_C, N);
    if (st != CUBLAS_STATUS_SUCCESS) {
      std::fprintf(stderr, "cublasSgemm failed: status %d\n", static_cast<int>(st));
    } else {
      // The GEMM is launched asynchronously; execution errors surface here.
      err = cudaDeviceSynchronize();
      if (err != cudaSuccess) {
        std::fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));
      } else {
        rc = 0;
      }
    }
  }

  // Release everything on every path; cudaFree on a null pointer is a no-op.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  cublasDestroy(handle);
  return rc;
}
$ nvcc -o t1 t1.cu -lcublas
$ nsys profile --stats=true ./t1
<snip>
[6/8] Executing 'gpukernsum' stats report
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) GridXYZ BlockXYZ Name
-------- --------------- --------- --------- --------- -------- -------- ----------- -------------- -------------- ----------------------
100.0 149,152 1 149,152.0 149,152.0 149,152 149,152 0.0 8 16 5 128 1 1 ampere_sgemm_128x64_nn
<snip>
$ sudo /usr/local/cuda/bin/ncu ./t1
==PROF== Connected to process 43469 (/home/.../t1)
==PROF== Profiling "ampere_sgemm_128x64_nn" - 0: 0%....50%....100% - 10 passes
==PROF== Disconnected from process 43469
[43469] t1@127.0.0.1
ampere_sgemm_128x64_nn (8, 16, 5)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.0
<snip>
$ nvidia-smi
Wed Mar 8 17:55:14 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100-SXM... On | 00000000:01:00.0 Off | 0 |
| N/A 30C P0 54W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA A100-SXM... On | 00000000:41:00.0 Off | 0 |
| N/A 28C P0 50W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA A100-SXM... On | 00000000:81:00.0 Off | 0 |
| N/A 28C P0 49W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA A100-SXM... On | 00000000:C1:00.0 Off | 0 |
| N/A 27C P0 49W / 400W | 0MiB / 40960MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Perhaps your nsys profiler output does not actually correspond to that particular call, but to something else in your code?
Otherwise please provide a complete test case, just as I have done, including CUDA version.