Device not found for Shared cublas, but found for static cublas_static

Hi there, below is sample code that illustrates the problem I’m about to describe.

When using the cuBLAS primitives, the CUDA device can’t be found during execution; the CUDA error is 100 (cudaErrorNoDevice).

However, when I link with the static version, the code works fine. BTW, python code also works just fine in my setup.

Compile parameters that work (if you remove the _static suffixes from the library names, it doesn’t work):

nvcc -O3 -I ../../utils --use_fast_math cublas_amin_example.cu -lcublas_static -lcublasLt_static -lculibos -o a.out

#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

/* Element type of the vector handed to cuBLAS. */
typedef double data_type;

/*
 * Evaluates the cuBLAS call `x` exactly once and prints
 * "<call text> : <numeric status>" to stdout.
 *
 * The status is only reported, never acted on: execution continues even when
 * the call fails, so every step's status shows up in the output.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * stays correct inside an unbraced if/else.
 */
#define CUBLAS_CHECK(x)                          \
    do {                                         \
        int status_ = (x);                       \
        printf("%s : %d\n", #x, status_);        \
    } while (0)

/*
 * Evaluates the CUDA runtime call `x` exactly once and prints
 * "<call text> : <numeric status>" to stdout.
 *
 * The status is only reported, never acted on: execution continues even when
 * the call fails, so every step's status shows up in the output.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * stays correct inside an unbraced if/else.
 */
#define CUDA_CHECK(x)                            \
    do {                                         \
        int status_ = (x);                       \
        printf("%s : %d\n", #x, status_);        \
    } while (0)

/*
 * Minimal reproducer: runs cublasIdamax on a 4-element host vector and prints
 * the status code of every CUDA / cuBLAS call through CUDA_CHECK and
 * CUBLAS_CHECK, followed by the computed index.
 *
 * Command-line arguments are ignored. The function always returns
 * EXIT_SUCCESS; failures are visible only in the printed status codes.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;

    /*
     *   A = | 1.0 2.0 3.0 4.0 |
     */
    const data_type A[] = {1.0, 2.0, 3.0, 4.0};
    /* Element count and byte size are derived from the array itself so the
     * allocation, the copy and the cuBLAS call cannot drift apart. */
    const int n = (int)(sizeof(A) / sizeof(A[0]));
    const size_t bytes = sizeof(A);
    const int incx = 1;

    int result = 0;

    data_type *d_A = NULL;

    printf("A\n");

    /* step 1: create cublas handle, bind a stream */
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    /* step 2: copy data to device */
    CUDA_CHECK(cudaMalloc(&d_A, bytes));

    CUDA_CHECK(cudaMemcpyAsync(d_A, A, bytes, cudaMemcpyHostToDevice, stream));

    /* step 3: compute.
     * `result` is a host variable, which matches the handle's default (host)
     * pointer mode as documented for cuBLAS. */
    CUBLAS_CHECK(cublasIdamax(cublasH, n, d_A, incx, &result));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    /*
     *   result = 4
     *
     *   cublasIdamax reports the 1-based index of the element with the
     *   largest magnitude, which is 4.0 at position 4 here.
     */

    printf("result\n");
    printf("%d\n", result);
    printf("=====\n");

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));

    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}

My environment:
Linux xxxx 5.15.146.1-microsoft-standard-WSL2 #1 SMP Thu Jan 11 04:09:03 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.120 Driver Version: 537.58 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4060 ... On | 00000000:01:00.0 Off | N/A |
| N/A 39C P0 16W / 60W | 0MiB / 8188MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+