I am new to CUDA programming. I am doing some experiments with the cuSOLVER Lapack APIs. One sample program calls cusolverDnSgetrf() and cusolverDnSgetrs() in an infinite loop. This code runs fine on A10. However, on A100 (A100-SXM4-40GB) it runs fine for a few iterations and then the cudaDeviceSynchronize() called after cusolverDnSgetrf() fails with error code 700 (cudaErrorIllegalAddress).
-
For matrix size 10000, the failure happens in the 72nd iteration.
-
For matrix size 20000, the failure happens in the 49th iteration.
-
For matrix size 30000, the failure happens in the 42nd iteration.
-
For matrix size 40000, the failure happens in the 39th iteration.
-
Ran matrix size 100 for ~50000 iterations. No failures.
-
Ran matrix size 1000 for ~5000 iterations. No failures.
-
Ran matrix size 5000 for ~1000 iterations. No failures.
Tried cuda-memcheck on the A100. There were 112 errors, all saying “Invalid global read of size 4” in getrf3_swap_alg1(). Some of the error messages are given below.
Since the program runs in an infinite loop and never exits, cuda-memcheck could not produce a report on A10, where the program shows no failures. I changed the code to execute for only 100 iterations; with that change, cuda-memcheck did not report any errors on A10.
The code looks something like this
/* Allocate all buffers in CUDA unified (managed) memory so both host and
 * device can touch them: the n x n matrix `a`, right-hand side `b`,
 * solution `x`, the pivot index array, and the getrf/getrs info flag. */
cuda_stat = cudaMallocManaged(&a, (n * n * sizeof(*a)));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
cuda_stat = cudaMallocManaged(&b, (n * sizeof (*b)));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
cuda_stat = cudaMallocManaged(&x, (n * sizeof (*x)));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
cuda_stat = cudaMallocManaged(&pivot, (n * sizeof (*pivot)));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
cuda_stat = cudaMallocManaged(&info, sizeof (*info));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
/* Create the cuSOLVER dense handle once, reused for all iterations. */
cusolver_stat = cusolverDnCreate(&handle);
if (cusolver_stat != CUSOLVER_STATUS_SUCCESS) {
printf ("cuSOLVER initialization failed with error code %d\n", cusolver_stat);
exit(-1);
}
. . .
. . .
/* Query the workspace size getrf needs for an n x n factorization, then
 * allocate the workspace once (also managed memory) before the loop. */
cusolver_stat = cusolverDnSgetrf_bufferSize(handle, n, n, a, lda, &wspace_size);
cuda_stat = cudaDeviceSynchronize();
CUDA_CHECK(cuda_stat, "cudaDeviceSynchronize");
if (cusolver_stat != CUSOLVER_STATUS_SUCCESS) {
printf ("cuSOLVER dense getrf_bufferSize failed with error code %d\n", cusolver_stat);
exit(-1);
}
cuda_stat = cudaMallocManaged(&wspace, (wspace_size * sizeof (*wspace)));
CUDA_CHECK(cuda_stat, "cudaMallocManaged");
i = 1;
/* Stress loop: factor `a` (LU with partial pivoting) and solve with the
 * factors, forever.  NOTE(review): per the cuSOLVER docs, cusolverDnSgetrf
 * overwrites `a` in place with its L and U factors, and `a` is never
 * re-initialized inside the loop.  So from iteration 2 onward each getrf
 * factors the previous iteration's factor output, not the original matrix.
 * Repeated in-place refactorization can drive the matrix entries toward
 * Inf/NaN; a degenerate matrix plausibly explains the out-of-bounds reads
 * reported in the pivot-swap kernel (getrf3_swap_alg1) and the resulting
 * cudaErrorIllegalAddress — TODO confirm by dumping `a`/`pivot` contents in
 * the failing iteration.  Re-initializing `a` each iteration (or factoring
 * a copy) would make every iteration well-defined. */
while (1) {
printf("loop=%d\n", i); fflush(NULL);i++;
/* In-place LU factorization of `a`; pivot indices land in `pivot`,
 * device-side status in `*info` (0 == success). */
cusolver_stat = cusolverDnSgetrf(handle, n, n, a, lda, wspace, pivot, info);
/* getrf is asynchronous: synchronize so any kernel-side fault (e.g. the
 * error 700 seen on A100) and `*info` are observable on the host. */
cuda_stat = cudaDeviceSynchronize();
CUDA_CHECK(cuda_stat, "cudaDeviceSynchronize");
if (cusolver_stat != CUSOLVER_STATUS_SUCCESS) {
printf ("cuSOLVER dense getrf failed with error code %d\n", cusolver_stat);
exit(-1);
}
/* Reading *info on the host is safe here because of the sync above
 * (managed memory + device still running would otherwise race). */
if (*info != 0) {
printf ("cuSOLVER dense getrf finished but set error info to %d\n", *info);
exit(-1);
}
/* Solve A * x = b using the LU factors; `b` is overwritten with the
 * solution (single right-hand side, no transpose). */
cublasOperation_t trans = CUBLAS_OP_N;
int nrhs = 1;
int ldb = n;
cusolver_stat = cusolverDnSgetrs(handle, trans, n, nrhs, a, lda, pivot, b, ldb, info);
cuda_stat = cudaDeviceSynchronize();
CUDA_CHECK(cuda_stat, "cudaDeviceSynchronize");
if (cusolver_stat != CUSOLVER_STATUS_SUCCESS) {
printf ("cuSOLVER dense getrs failed with error code %d\n", cusolver_stat);
exit(-1);
}
if (*info != 0) {
printf ("cuSOLVER dense getrs finished but set error info to %d\n", *info);
exit(-1);
}
} /* while (1) */
The input matrices are initialized only once, before the infinite loop. So after the first iteration, whatever is inside the matrices is passed as input to the subsequent iterations. This is not a problem for my program, as it does not do any data validation. I use this program to observe its effect on GPU power.
I thought that perhaps I needed cudaDeviceSynchronize() after the calls to cudaMallocManaged(). So I added it in two places : once after the first set of cudaMallocManaged() calls and again after the cudaMallocManaged() for wspace. No change in behaviour.
All 112 cuda-memcheck errors on A100 seem to occur at the same code location. Three of the errors are given below:
========= Invalid __global__ read of size 4
========= at 0x000007f0 in void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int)
========= by thread (15,0,0) in block (39,0,0)
========= Address 0x7f9089d8203c is out of bounds
========= Device Frame:void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) (void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) : 0x7f0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so [0x20d6ea]
========= Host Frame:./nvgpu_lapack [0x20af0b]
========= Host Frame:./nvgpu_lapack [0x261548]
========= Host Frame:./nvgpu_lapack [0x5f524]
========= Host Frame:./nvgpu_lapack [0x611d1]
========= Host Frame:./nvgpu_lapack [0x52fb3]
========= Host Frame:./nvgpu_lapack [0x4bc30]
========= Host Frame:./nvgpu_lapack [0x4d141]
========= Host Frame:./nvgpu_lapack [0xd64f]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22555]
========= Host Frame:./nvgpu_lapack [0x4b2b9]
=========
========= Invalid __global__ read of size 4
========= at 0x000007f0 in void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int)
========= by thread (14,0,0) in block (39,0,0)
========= Address 0x7f9089d82038 is out of bounds
========= Device Frame:void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) (void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) : 0x7f0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so [0x20d6ea]
========= Host Frame:./nvgpu_lapack [0x20af0b]
========= Host Frame:./nvgpu_lapack [0x261548]
========= Host Frame:./nvgpu_lapack [0x5f524]
========= Host Frame:./nvgpu_lapack [0x611d1]
========= Host Frame:./nvgpu_lapack [0x52fb3]
========= Host Frame:./nvgpu_lapack [0x4bc30]
========= Host Frame:./nvgpu_lapack [0x4d141]
========= Host Frame:./nvgpu_lapack [0xd64f]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22555]
========= Host Frame:./nvgpu_lapack [0x4b2b9]
=========
========= Invalid __global__ read of size 4
========= at 0x000007f0 in void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int)
========= by thread (13,0,0) in block (39,0,0)
========= Address 0x7f9089d82034 is out of bounds
========= Device Frame:void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) (void getrf3_swap_alg1<float, int=256>(int, int, int, float*, int, int const *, int, int) : 0x7f0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so [0x20d6ea]
========= Host Frame:./nvgpu_lapack [0x20af0b]
========= Host Frame:./nvgpu_lapack [0x261548]
========= Host Frame:./nvgpu_lapack [0x5f524]
========= Host Frame:./nvgpu_lapack [0x611d1]
========= Host Frame:./nvgpu_lapack [0x52fb3]
========= Host Frame:./nvgpu_lapack [0x4bc30]
========= Host Frame:./nvgpu_lapack [0x4d141]
========= Host Frame:./nvgpu_lapack [0xd64f]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x22555]
========= Host Frame:./nvgpu_lapack [0x4b2b9]
=========
I would like to understand
- what is causing the failure on A100 (A100-SXM4-40GB), given that the addresses passed to the APIs do not change between iterations
- why this program runs fine on A10 but fails on A100
- why it is able to run fine for some iterations on A100 before the failure
- why it does not (seem to) fail on A100 for matrix sizes smaller
than 10000.
Thanks