Consistent "CUDA error: an illegal memory access was encountered" Error

Hello,

I’ve run into “CUDA error: an illegal memory access was encountered” multiple times, with both PyTorch and Ollama. I’ve cut my code down to the simplest form that still reproduces the error. Here’s the code:

import torch

param = torch.randn((150000, 1000), dtype=torch.bfloat16, device='cuda:0')
param = param.to(torch.float16)
param = param.cpu()

print("success")

And the error is the following:

(torch) root@Htzr:~/code# python ./repro_error_2.py
Traceback (most recent call last):
  File "/root/code/./repro_error_2.py", line 5, in <module>
    param = param.cpu()
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
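
Since the message says kernel errors can be reported asynchronously, the line in the traceback may not be where the failure actually happens. I re-ran with CUDA_LAUNCH_BLOCKING=1 (that’s the compute-sanitizer run further below); for reference, the same flag can also be set from inside the script before any CUDA work is done (a minimal sketch, not what I actually ran):

import os
# Must be set before the first CUDA call; equivalent to CUDA_LAUNCH_BLOCKING=1 on the command line.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

param = torch.randn((150000, 1000), dtype=torch.bfloat16, device='cuda:0')
param = param.to(torch.float16)
param = param.cpu()

print("success")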

When I run more complicated PyTorch code or deploy models with Ollama, I sooner or later run into this same error. Here are my environment details:

(torch) root@Htzr:~/code# conda list
# packages in environment at /root/miniconda3/envs/torch:
#
# Name                      Version          Build               Channel
_libgcc_mutex               0.1              main
_openmp_mutex               5.1              1_gnu
accelerate                  1.12.0           pypi_0              pypi
bitsandbytes                0.49.0           pypi_0              pypi
bzip2                       1.0.8            h5eee18b_6
ca-certificates             2025.12.2        h06a4308_0
certifi                     2025.11.12       pypi_0              pypi
charset-normalizer          3.4.4            pypi_0              pypi
filelock                    3.20.0           pypi_0              pypi
fsspec                      2025.12.0        pypi_0              pypi
hf-xet                      1.2.0            pypi_0              pypi
huggingface-hub             0.36.0           pypi_0              pypi
idna                        3.11             pypi_0              pypi
jinja2                      3.1.6            pypi_0              pypi
ld_impl_linux-64            2.44             h153f514_2
libexpat                    2.7.3            h7354ed3_4
libffi                      3.4.4            h6a678d5_1
libgcc                      15.2.0           h69a1729_7
libgcc-ng                   15.2.0           h166f726_7
libgomp                     15.2.0           h4751f2c_7
libmpdec                    4.0.0            h5eee18b_0
libstdcxx                   15.2.0           h39759b7_7
libstdcxx-ng                15.2.0           hc03a8fd_7
libuuid                     1.41.5           h5eee18b_0
libxcb                      1.17.0           h9b100fa_0
libzlib                     1.3.1            hb25bd0a_0
markupsafe                  3.0.2            pypi_0              pypi
modelscope                  1.33.0           pypi_0              pypi
mpmath                      1.3.0            pypi_0              pypi
ncurses                     6.5              h7934f7d_0
networkx                    3.6.1            pypi_0              pypi
numpy                       2.3.5            pypi_0              pypi
nvidia-cublas-cu12          12.6.4.1         pypi_0              pypi
nvidia-cuda-cupti-cu12      12.6.80          pypi_0              pypi
nvidia-cuda-nvrtc-cu12      12.6.77          pypi_0              pypi
nvidia-cuda-runtime-cu12    12.6.77          pypi_0              pypi
nvidia-cudnn-cu12           9.10.2.21        pypi_0              pypi
nvidia-cufft-cu12           11.3.0.4         pypi_0              pypi
nvidia-cufile-cu12          1.11.1.6         pypi_0              pypi
nvidia-curand-cu12          10.3.7.77        pypi_0              pypi
nvidia-cusolver-cu12        11.7.1.2         pypi_0              pypi
nvidia-cusparse-cu12        12.5.4.2         pypi_0              pypi
nvidia-cusparselt-cu12      0.7.1            pypi_0              pypi
nvidia-nccl-cu12            2.27.5           pypi_0              pypi
nvidia-nvjitlink-cu12       12.6.85          pypi_0              pypi
nvidia-nvshmem-cu12         3.3.20           pypi_0              pypi
nvidia-nvtx-cu12            12.6.77          pypi_0              pypi
openssl                     3.0.18           hd6dcaed_0
packaging                   25.0             pypi_0              pypi
pillow                      12.0.0           pypi_0              pypi
pip                         25.3             pyhc872135_0
psutil                      7.2.1            pypi_0              pypi
pthread-stubs               0.3              h0ce48e5_1
python                      3.13.11          hcf712cf_100_cp313
python_abi                  3.13             3_cp313
pyyaml                      6.0.3            pypi_0              pypi
readline                    8.3              hc2a1206_0
regex                       2025.11.3        pypi_0              pypi
requests                    2.32.5           pypi_0              pypi
safetensors                 0.7.0            pypi_0              pypi
setuptools                  80.9.0           py313h06a4308_0
sqlite                      3.51.0           h2a70700_0
sympy                       1.14.0           pypi_0              pypi
tk                          8.6.15           h54e0aa7_0
tokenizers                  0.22.1           pypi_0              pypi
torch                       2.9.1+cu126      pypi_0              pypi
torchvision                 0.24.1+cu126     pypi_0              pypi
tqdm                        4.67.1           pypi_0              pypi
transformers                4.57.3           pypi_0              pypi
triton                      3.5.1            pypi_0              pypi
typing-extensions           4.15.0           pypi_0              pypi
tzdata                      2025b            h04d1e81_0
urllib3                     2.6.2            pypi_0              pypi
wheel                       0.45.1           py313h06a4308_0
xorg-libx11                 1.8.12           h9b100fa_1
xorg-libxau                 1.0.12           h9b100fa_0
xorg-libxdmcp               1.1.5            h9b100fa_0
xorg-xorgproto              2024.1           h5eee18b_1
xz                          5.6.4            h5eee18b_1
zlib                        1.3.1            hb25bd0a_0

==============================================================

I also tried running with `compute-sanitizer` and here is the output:

(torch) root@Htzr:~/code# CUDA_LAUNCH_BLOCKING=1 compute-sanitizer python ./repro_error_2.py
========= COMPUTE-SANITIZER
========= Error: Device not supported. Please refer to the "Supported Devices" section of the sanitizer documentation
========= 
========= Program hit cudaErrorIllegalAddress (error 700) due to "an illegal memory access was encountered" on CUDA API call to cudaLaunchKernel.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x16ccc1]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0x115b60]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xc78a5]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xc7d85]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xd04bd]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0x2bbdb]
=========                in /usr/bin/libsanitizer-public.so
=========     Host Frame: [0x369fc]
=========                in /usr/bin/libsanitizer-public.so
=========     Host Frame: [0x34f9e3]
=========                in /usr/lib/wsl/drivers/nvmd.inf_amd64_aa54f7a758543a0a/libcuda.so.1.1
=========     Host Frame:cudaLaunchKernel [0x75a2e]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/../../nvidia/cuda_runtime/lib/libcudart.so.12
=========     Host Frame:void at::native::gpu_kernel_impl<__nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(at::TensorIteratorBase&), &at::native::direct_copy_kernel_cuda, 18u>, c10::Half (c10::Half)> >(at::TensorIteratorBase&, __nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(at::TensorIteratorBase&), &at::native::direct_copy_kernel_cuda, 18u>, c10::Half (c10::Half)> const&) [clone .isra.0] [0x1ba14dc]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cuda.so
=========     Host Frame:at::native::copy_device_to_device(at::TensorIterator&, bool, bool) [0x1bcdeed]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cuda.so
=========     Host Frame:at::native::copy_impl(at::Tensor&, at::Tensor const&, bool) [clone .isra.0] [0x1b84d43]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::copy_(at::Tensor&, at::Tensor const&, bool) [0x1b869c7]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::copy_::call(at::Tensor&, at::Tensor const&, bool) [0x2842d37]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::_to_copy(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x1e64a3f]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd___to_copy>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2c4f23c]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2318b82]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::_to_copy>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2a16c2b]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2318b82]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:torch::autograd::VariableType::(anonymous namespace)::_to_copy(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x4b1614c]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &torch::autograd::VariableType::(anonymous namespace)::_to_copy>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x4b165af]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x23bab5d]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::to(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x1e628c4]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd_dtype_to>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x2d4eedf]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::to_dtype::call(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x2555de2]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:torch::autograd::dispatch_to(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x428467]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_python.so
=========     Host Frame:torch::autograd::THPVariable_to(_object*, _object*, _object*) [0x429d64]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_python.so
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Objects/descrobject.c:360:method_vectorcall_VARARGS_KEYWORDS [0x1c895c]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Objects/call.c:327:PyObject_Vectorcall [0x1944ae]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/generated_cases.c.h:817:_PyEval_EvalFrameDefault [0x1a5dd2]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/ceval.c:604:PyEval_EvalCode [0x26938f]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1382:run_eval_code_obj [0x2a79f3]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1490:run_mod [0x2a4b4c]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1295:pyrun_file [0x2a1926]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:517:_PyRun_SimpleFileObject [0x2a15a3]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:77:_PyRun_AnyFileObject [0x2a13bc]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Modules/main.c:775:Py_RunMain [0x29fd74]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Modules/main.c:830:Py_BytesMain [0x2548c7]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0x253c8e]
=========                in /root/miniconda3/envs/torch/bin/python
========= 
========= Program hit cudaErrorIllegalAddress (error 700) due to "an illegal memory access was encountered" on CUDA API call to cudaGetLastError.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame: [0x16ccc1]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0x115b60]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xc78a5]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xc7d85]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0xd04bd]
=========                in /usr/bin/libsanitizer-collection.so
=========     Host Frame: [0x2bbdb]
=========                in /usr/bin/libsanitizer-public.so
=========     Host Frame: [0x369fc]
=========                in /usr/bin/libsanitizer-public.so
=========     Host Frame: [0x34f9e3]
=========                in /usr/lib/wsl/drivers/nvmd.inf_amd64_aa54f7a758543a0a/libcuda.so.1.1
=========     Host Frame:cudaGetLastError [0x4dd27]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/../../nvidia/cuda_runtime/lib/libcudart.so.12
=========     Host Frame:void at::native::gpu_kernel_impl<__nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(at::TensorIteratorBase&), &at::native::direct_copy_kernel_cuda, 18u>, c10::Half (c10::Half)> >(at::TensorIteratorBase&, __nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(at::TensorIteratorBase&), &at::native::direct_copy_kernel_cuda, 18u>, c10::Half (c10::Half)> const&) [clone .isra.0] [0x1ba1031]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cuda.so
=========     Host Frame:at::native::copy_device_to_device(at::TensorIterator&, bool, bool) [0x1bcdeed]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cuda.so
=========     Host Frame:at::native::copy_impl(at::Tensor&, at::Tensor const&, bool) [clone .isra.0] [0x1b84d43]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::copy_(at::Tensor&, at::Tensor const&, bool) [0x1b869c7]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::copy_::call(at::Tensor&, at::Tensor const&, bool) [0x2842d37]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::_to_copy(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x1e64a3f]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd___to_copy>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2c4f23c]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2318b82]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::_to_copy>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2a16c2b]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x2318b82]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:torch::autograd::VariableType::(anonymous namespace)::_to_copy(c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x4b1614c]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>), &torch::autograd::VariableType::(anonymous namespace)::_to_copy>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x4b165af]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::_to_copy::call(at::Tensor const&, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>, bool, std::optional<c10::MemoryFormat>) [0x23bab5d]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::native::to(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x1e628c4]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd_dtype_to>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat> > >, at::Tensor (at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x2d4eedf]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:at::_ops::to_dtype::call(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x2555de2]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_cpu.so
=========     Host Frame:torch::autograd::dispatch_to(at::Tensor const&, c10::ScalarType, bool, bool, std::optional<c10::MemoryFormat>) [0x428467]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_python.so
=========     Host Frame:torch::autograd::THPVariable_to(_object*, _object*, _object*) [0x429d64]
=========                in /root/miniconda3/envs/torch/lib/python3.13/site-packages/torch/lib/libtorch_python.so
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Objects/descrobject.c:360:method_vectorcall_VARARGS_KEYWORDS [0x1c895c]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Objects/call.c:327:PyObject_Vectorcall [0x1944ae]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/generated_cases.c.h:817:_PyEval_EvalFrameDefault [0x1a5dd2]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/ceval.c:604:PyEval_EvalCode [0x26938f]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1382:run_eval_code_obj [0x2a79f3]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1490:run_mod [0x2a4b4c]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:1295:pyrun_file [0x2a1926]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:517:_PyRun_SimpleFileObject [0x2a15a3]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Python/pythonrun.c:77:_PyRun_AnyFileObject [0x2a13bc]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Modules/main.c:775:Py_RunMain [0x29fd74]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame:/usr/local/src/conda/python-3.13.11/Modules/main.c:830:Py_BytesMain [0x2548c7]
=========                in /root/miniconda3/envs/torch/bin/python
=========     Host Frame: [0x29d90]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:__libc_start_main [0x29e40]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0x253c8e]
=========                in /root/miniconda3/envs/torch/bin/python
========= 
Traceback (most recent call last):
  File "/root/code/./repro_error_2.py", line 4, in <module>
    param = param.to(torch.float16)
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

========= Target application returned an error
========= ERROR SUMMARY: 3 errors

Any idea? Any help would be much appreciated!

Hi, @yinqb

This forum is for supporting the compute-sanitizer developer tool.
Do you think something is wrong with compute-sanitizer itself?

I see your environment is CUDA 12.6, and PyTorch fails right away. Are you using a PyTorch build that is compatible with CUDA 12.6?
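
For example, you can quickly check which CUDA build of PyTorch you have and whether it sees the GPU (a short sketch using standard torch attributes):

import torch

print(torch.__version__)                    # e.g. 2.9.1+cu126
print(torch.version.cuda)                   # CUDA version this PyTorch build was compiled against
print(torch.cuda.is_available())            # whether a usable GPU is visible
print(torch.cuda.get_device_name(0))        # name of GPU 0
print(torch.cuda.get_device_capability(0))  # compute capability of GPU 0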

For any further questions about PyTorch, please ask in a deep learning or CUDA programming related forum. Thanks!

Hello,

Thanks for your reply! The issue I’m running into is CUDA error 700 (cudaErrorIllegalAddress), “CUDA error: an illegal memory access was encountered”. I was only using compute-sanitizer to try to debug it. Which forum should I redirect this to? Thanks.

Posted here: cudaErrorIllegalAddress Encountered: “CUDA error: an illegal memory access was encountered” - CUDA / CUDA Programming and Performance - NVIDIA Developer Forums

Can you try upgrading to driver r590, and compute-sanitizer from CUDA 13.1? Also, can you try adding explicit synchronizations to isolate the issue? Like:

import torch

param = torch.randn((150000, 1000), dtype=torch.bfloat16, device='cuda:0')
torch.cuda.synchronize() # does it fail here?

param = param.to(torch.float16)
torch.cuda.synchronize() # or here?

param = param.cpu() # or only here?

My guess would be that it fails on torch.randn, which could potentially be due to a PyTorch bug.
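
If it does fail at the first synchronize, one more isolation step would be to take torch.randn out of the picture entirely (a sketch; torch.empty plus fill_ allocates the same tensor without launching the random-number kernel). If this version passes while the randn version fails, the problem is localized to the distribution kernel; if it also fails, it points more toward the driver, the WSL stack, or the GPU itself:

import torch

# Same shape and dtype as the original repro, but no RNG kernel is launched.
param = torch.empty((150000, 1000), dtype=torch.bfloat16, device='cuda:0')
param.fill_(1.0)
torch.cuda.synchronize()

param = param.to(torch.float16)
torch.cuda.synchronize()

param = param.cpu()
print("success")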

I’ve done the updates; here are the screenshots:

The driver is now 591.59 and CUDA is now 13.1.

Here’s the output for the code after adding `torch.cuda.synchronize()`:

(torch131) root@Htzr:~/code# compute-sanitizer python repro_error.py
========= COMPUTE-SANITIZER
========= Program hit cudaErrorUnknown (error 999) due to "unknown error" on CUDA API call to cudaDeviceSynchronize.
=========
========= Program hit cudaErrorUnknown (error 999) due to "unknown error" on CUDA API call to cudaGetLastError.
=========
Traceback (most recent call last):
  File "/root/code/repro_error.py", line 4, in <module>
    torch.cuda.synchronize()
    ~~~~~~~~~~~~~~~~~~~~~~^^
  File "/root/miniconda3/envs/torch131/lib/python3.14/site-packages/torch/cuda/__init__.py", line 1083, in synchronize
    return torch._C._cuda_synchronize()
           ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
torch.AcceleratorError: CUDA error: unknown error
Search for `cudaErrorUnknown' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

========= Invalid  atomic of size 256 bytes
=========     at void at::native::<unnamed>::distribution_elementwise_grid_stride_kernel<float, (int)4, void at::native::templates::cuda::normal_and_transform<c10::BFloat16, float, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::<unnamed>::distribution_nullary_kernel<c10::BFloat16, float, float4, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_and_transform<c10::BFloat16, float, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, const T5 &, T6)::[lambda(int, float) (instance 1)]>(long, at::PhiloxCudaState, T3, T4)+0x1460
=========     by thread (31,0,0) in block (138,0,0)
=========     Access to 0x71ca1153e is out of bounds
=========
========= Error in printing record
========= Stack overflow
=========
========= Invalid  atomic of size 256 bytes
=========     at void at::native::<unnamed>::distribution_elementwise_grid_stride_kernel<float, (int)4, void at::native::templates::cuda::normal_and_transform<c10::BFloat16, float, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::<unnamed>::distribution_nullary_kernel<c10::BFloat16, float, float4, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_and_transform<c10::BFloat16, float, at::CUDAGeneratorImpl *, void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T3, T4)::[lambda(curandStatePhilox4_32_10 *) (instance 2)], void at::native::templates::cuda::normal_kernel<at::CUDAGeneratorImpl *>(const at::TensorBase &, double, double, T1)::[lambda() (instance 1)]::operator ()() const::[lambda() (instance 4)]::operator ()() const::[lambda(float) (instance 1)]>(at::TensorIteratorBase &, T4, const T5 &, T6)::[lambda(int, float) (instance 1)]>(long, at::PhiloxCudaState, T3, T4)+0x17c0
=========     by thread (31,0,0) in block (138,0,0)
=========     Access to 0x71cb3153e is out of bounds
=========
========= Error in printing record
terminate called after throwing an instance of 'std::length_error'
  what():  cannot create std::vector larger than max_size()
========= Error: process didn't terminate successfully
========= Target application returned an error
========= ERROR SUMMARY: 5 errors

Seems like it’s still failing. My PyTorch version for the current CUDA is 2.9.1+cu130. Reading the sanitizer output, the out-of-bounds accesses are inside the normal-distribution kernel, so it does look like torch.randn is the first thing to fail.

I also tried a standalone CUDA C++ script and got the same kind of error.

#include <iostream>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>  // CUDA runtime API (nvcc includes this implicitly for .cu files)


__global__ void vectorAdd(const float* A, const float* B, float* C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int n = 1024 * 1024;  // about 1 million elements (keeps the test reasonably fast)
    size_t size = n * sizeof(float); // must be large enough; the error shows up once enough GPU memory is in use

    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);

    srand(time(0));  // seed the random number generator
    for (int i = 0; i < n; ++i) {
        h_A[i] = static_cast<float>(rand()) / RAND_MAX;  // random values in [0, 1]
        h_B[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int blockSize = 256;  // threads per block (a common choice, adjustable)
    int gridSize = (n + blockSize - 1) / blockSize;
    std::cout << "Launching kernel with gridSize: " << gridSize 
              << ", blockSize: " << blockSize << std::endl;
    vectorAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
    // Check for launch and execution errors instead of discarding the return codes.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess) {
        std::cerr << "Kernel launch error: " << cudaGetErrorString(launchErr) << std::endl;
    }
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        std::cerr << "Kernel execution error: " << cudaGetErrorString(syncErr) << std::endl;
    }

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();

    return 0;
}

(screenshot of the program output showing the same illegal memory access error)

Problem solved. I contacted customer support and had the 4090 replaced with another one.