Next problem, encountered while training in FP16:
RuntimeError: Unable to find a valid cuDNN algorithm to run convolution
in response to:
import torch.__config__
print(torch.__config__.show())
prints:
PyTorch built with:
- GCC 7.5
- C++ Version: 201402
- OpenMP 201511 (a.k.a. OpenMP 4.5)
- NNPACK is enabled
- CPU capability usage: NO AVX
- CUDA Runtime 10.2
- NVCC architecture flags: -gencode;arch=compute_53,code=sm_53;-gencode;arch=compute_62,code=sm_62;-gencode;arch=compute_72,code=sm_72
- CuDNN 8.0
- Magma 2.5.3
The error detail is:
~/envs/fastai2/lib/python3.6/site-packages/fastai2/learner.py in all_batches(self)
151 def all_batches(self):
152 self.n_iter = len(self.dl)
--> 153 for o in enumerate(self.dl): self.one_batch(*o)
154
155 def one_batch(self, i, b):
~/envs/fastai2/lib/python3.6/site-packages/fastai2/learner.py in one_batch(self, i, b)
161 self.loss = self.loss_func(self.pred, *self.yb); self('after_loss')
162 if not self.training: return
--> 163 self.loss.backward(); self('after_backward')
164 self.opt.step(); self('after_step')
165 self.opt.zero_grad()
~/envs/fastai2/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
182 products. Defaults to ``False``.
183 """
--> 184 torch.autograd.backward(self, gradient, retain_graph, create_graph)
185
186 def register_hook(self, hook):
~/envs/fastai2/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
123 Variable._execution_engine.run_backward(
124 tensors, grad_tensors, retain_graph, create_graph,
--> 125 allow_unreachable=True) # allow_unreachable flag
126
RuntimeError: Unable to find a valid cuDNN algorithm to run convolution
Exception raised from try_all at ../aten/src/ATen/native/cudnn/Conv.cpp:692 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0xa0 (0x7f9817f3f0 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x2c00a6c (0x7f349c7a6c in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #2: <unknown function> + 0x2bf326c (0x7f349ba26c in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #3: <unknown function> + 0x2bf4280 (0x7f349bb280 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #4: <unknown function> + 0x2bf7d04 (0x7f349bed04 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #5: at::native::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0x70 (0x7f349bef70 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0x2c58b40 (0x7f34a1fb40 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0x2cc58bc (0x7f34a8c8bc in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #8: at::native::cudnn_convolution_backward(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool, std::array<bool, 2ul>) + 0x288 (0x7f349bf990 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #9: <unknown function> + 0x2c587a8 (0x7f34a1f7a8 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #10: <unknown function> + 0x2cc595c (0x7f34a8c95c in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)
frame #11: <unknown function> + 0x1e8a91c (0x7f6398491c in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x1e8bdfc (0x7f63985dfc in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #13: torch::autograd::generated::CudnnConvolutionBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x298 (0x7f6373fb98 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #14: <unknown function> + 0x212f988 (0x7f63c29988 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #15: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x10b0 (0x7f63c24ae0 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #16: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&, bool) + 0x424 (0x7f63c25644 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #17: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0xa0 (0x7f63c1cdf0 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #18: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x60 (0x7f906ba118 in /home/bart/envs/fastai2/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #19: <unknown function> + 0xbbe94 (0x7f99a0ce94 in /usr/lib/aarch64-linux-gnu/libstdc++.so.6)
frame #20: <unknown function> + 0x7088 (0x7f9bb7c088 in /lib/aarch64-linux-gnu/libpthread.so.0)
Any thoughts as to why cuDNN 8.0 can't do the (I'm guessing) 2d convolution?