NVIDIA DIGITS crashes with a SIGSEGV when training starts

We are using the following Dockerfile to build our DIGITS image:

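# Stage 1: build protobuf 3.2.x from source (installed under /usr/local/protobuf)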
FROM ubuntu:14.04 as protobuf
RUN echo "deb http://security.ubuntu.com/ubuntu trusty-security main" >> /etc/apt/sources.list
RUN echo "deb http://cz.archive.ubuntu.com/ubuntu trusty main universe" >> /etc/apt/sources.list

RUN apt update && apt install -y software-properties-common && rm -rf /var/lib/apt/lists/*

RUN add-apt-repository ppa:fkrull/deadsnakes-python2.7

RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
        autoconf \
        automake \
        ca-certificates \
        curl \
        g++ \
        git \
        libtool \
        make \
        python-dev \
        python-setuptools \
        unzip && \
    rm -rf /var/lib/apt/lists/*


WORKDIR /protobuf
RUN git clone -b '3.2.x' https://github.com/google/protobuf.git . && \
    ./autogen.sh && \
    ./configure --prefix=/usr/local/protobuf && \
    make "-j$(nproc)" install


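# Stage 2: build NVIDIA Caffe 0.16 against CUDA 8.0 / cuDNN 7, reusing the protobuf build from stage 1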
FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu14.04 as caffe
RUN rm /etc/apt/sources.list.d/* && apt-get clean && apt-get update
RUN echo "deb http://security.ubuntu.com/ubuntu trusty-security main" >> /etc/apt/sources.list
RUN echo "deb http://cz.archive.ubuntu.com/ubuntu trusty main universe" >> /etc/apt/sources.list

COPY --from=protobuf /usr/local/protobuf /usr/local

RUN apt update && apt install -y software-properties-common && rm -rf /var/lib/apt/lists/*
RUN add-apt-repository ppa:fkrull/deadsnakes-python2.7

RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
        autoconf \
        automake \
        ca-certificates \
        curl \
        g++ \
        git \
        libtool \
        make \
        python-dev \
        python-setuptools \
        unzip && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /
RUN curl -O https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
    python get-pip.py && \
    pip install --upgrade --no-cache-dir pip

RUN pip install urllib3[secure]

RUN apt update && apt install -y --force-yes --no-install-recommends \
        build-essential cmake git gfortran libbz2-dev libatlas-base-dev libgflags-dev libgoogle-glog-dev libhdf5-serial-dev libleveldb-dev liblmdb-dev \
        libopencv-dev libsnappy-dev libturbojpeg libjpeg-turbo8-dev libopenblas-dev \
        python-all-dev python-dev python-h5py python-numpy python-opencv python-pil python-pip python-pydot python-scipy python-skimage python-sklearn \
        libboost-filesystem1.55-dev libboost-python1.55-dev libboost-system1.55-dev libboost-thread1.55-dev libboost-regex1.55-dev && \
    rm -rf /var/lib/apt/lists/*

RUN pip install matplotlib~=1.5.2 networkx~=1.8.1 cython pandas~=0.12.0 pyyaml~=3.10 traitlets~=4.2

# Install CMake 3.7.2
ADD https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.sh /cmake-3.7.2-Linux-x86_64.sh
RUN mkdir /opt/cmake
RUN sh /cmake-3.7.2-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
RUN cmake --version

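# NCCL 1.2.3 for CUDA 8.0 (Caffe is built with -DUSE_NCCL=ON below)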
ADD https://github.com/NVIDIA/nccl/releases/download/v1.2.3-1%2Bcuda8.0/libnccl1_1.2.3-1.cuda8.0_amd64.deb /tmp/libnccl1_1.2.3-1.cuda8.0_amd64.deb
ADD https://github.com/NVIDIA/nccl/releases/download/v1.2.3-1%2Bcuda8.0/libnccl-dev_1.2.3-1.cuda8.0_amd64.deb /tmp/libnccl-dev_1.2.3-1.cuda8.0_amd64.deb 

WORKDIR /tmp
RUN dpkg -i ./libnccl1_1.2.3-1.cuda8.0_amd64.deb
RUN dpkg -i ./libnccl-dev_1.2.3-1.cuda8.0_amd64.deb 

# Build caffe
RUN git clone https://github.com/nvidia/caffe.git /caffe -b 'caffe-0.16' && \
    cd /caffe && \
    pip install ipython==5.4.1 && \
    pip install tornado==4.5.3 && \
    pip install -r python/requirements.txt && \
    mkdir build && \
    cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/usr/local/caffe -DUSE_NCCL=ON -DUSE_CUDNN=ON -DCUDA_ARCH_NAME=Manual -DCUDA_ARCH_BIN="35 52 60 61" -DCUDA_ARCH_PTX="61" .. && \
    make -j"$(nproc)" install && \
    rm -rf /caffe


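# Stage 3: runtime image with DIGITS 6.0, copying in the Caffe and protobuf builds from the stages above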
FROM nvidia/cuda:8.0-cudnn7-runtime-ubuntu14.04
RUN rm /etc/apt/sources.list.d/* && apt-get clean && apt-get update
RUN echo "deb http://security.ubuntu.com/ubuntu trusty-security main" >> /etc/apt/sources.list
RUN echo "deb http://cz.archive.ubuntu.com/ubuntu trusty main universe" >> /etc/apt/sources.list

LABEL maintainer "NVIDIA CORPORATION <cudatools@nvidia.com>"

ENV DIGITS_VERSION 6.0

LABEL com.nvidia.digits.version="6.0"

COPY --from=caffe /usr/local/caffe /usr/local
COPY --from=protobuf /usr/local/protobuf /usr/local

# Install the packages needed to bootstrap pip; otherwise we run into numpy problems
RUN apt-get update && apt-get install -y \
        ca-certificates \
        curl gcc make && \
    rm -rf /var/lib/apt/lists/*

RUN apt update && apt install -y software-properties-common && rm -rf /var/lib/apt/lists/*
RUN add-apt-repository ppa:fkrull/deadsnakes-python2.7
RUN apt-get update && apt-get install -y --force-yes python-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /
# Bootstrap pip before installing the DIGITS packages, otherwise we run into numpy problems
RUN curl -O https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
    python get-pip.py && \
    pip install --upgrade --no-cache-dir pip

RUN pip install urllib3[secure]


# For Ubuntu 14.04
ENV CUDA_REPO_PKG=http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.61-1_amd64.deb
ENV ML_REPO_PKG=http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1404/x86_64/nvidia-machine-learning-repo-ubuntu1404_4.0-2_amd64.deb

RUN apt-get update && apt-get install -y --force-yes wget && \
    rm -rf /var/lib/apt/lists/*

# Install repo packages
RUN wget "$CUDA_REPO_PKG" -O /tmp/cuda-repo.deb && dpkg -i /tmp/cuda-repo.deb && rm -f /tmp/cuda-repo.deb
RUN wget "$ML_REPO_PKG" -O /tmp/ml-repo.deb && dpkg -i /tmp/ml-repo.deb && rm -f /tmp/ml-repo.deb

RUN apt-get update && apt-get install -y --force-yes --no-install-recommends g++ libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python-tk && \
    rm -rf /var/lib/apt/lists/*
    

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        graphviz \
        gunicorn \
        libatlas3-base \
        libboost-filesystem1.55.0 \
        libboost-python1.55.0 \
        libboost-system1.55.0 \
        libboost-thread1.55.0 \
        libboost-regex1.55.0 \
        libfreetype6-dev \
        libgoogle-glog0 \
        libhdf5-serial-dev \
        libleveldb1 \
        libnccl1=1.2.3-1+cuda8.0 \
        libopencv-core2.4 \
        libopencv-highgui2.4 \
        libopencv-imgproc2.4 \
        libpng12-dev \
        libzmq3 \
        nginx \
        pkg-config \
        python-dev \
        python-flask \
        python-flaskext.socketio \
        python-flaskext.wtf \
        rsync \
        software-properties-common \
        libturbojpeg libjpeg-turbo8-dev libopenblas-dev \
        torch7-nv=0.9.99-1+cuda8.0 && \
    rm -rf /var/lib/apt/lists/*



RUN pip install numpy==1.11.0
RUN pip install Werkzeug==0.16.1
RUN pip install scikit-fmm==0.0.9 python-socketio==1.4 setuptools==18.5
RUN pip install scikit-image
RUN pip install https://github.com/NVIDIA/DIGITS/archive/v6.0.1.tar.gz

RUN pip install --no-cache-dir \
        setuptools\>=18.5 \
        tensorflow-gpu==1.2.1 \
        protobuf==3.2.0




VOLUME /jobs

ENV DIGITS_JOBS_DIR=/jobs
ENV DIGITS_LOGFILE_FILENAME=/jobs/digits.log
ENV PYTHONPATH=/usr/local/python

# DIGITS
EXPOSE 5000

# TensorBoard
EXPOSE 6006

ENTRYPOINT ["python", "-m", "digits"]
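
For completeness, we build and run the image roughly like this (the image tag and the host jobs directory are placeholders; on setups without Docker 19.03+ and the NVIDIA container toolkit, --runtime=nvidia is used instead of --gpus all):

docker build -t digits:custom .
docker run --gpus all -d -p 5000:5000 -p 6006:6006 -v /path/to/jobs:/jobs digits:custom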

When we start training, the Caffe process crashes with the following output:

I0406 16:41:24.519845   112 net.cpp:98] Using FLOAT as default forward math type
I0406 16:41:24.519855   112 net.cpp:104] Using FLOAT as default backward math type
I0406 16:41:24.519872   112 layer_factory.hpp:172] Creating layer 'train-data' of type 'Data'
I0406 16:41:24.519887   112 layer_factory.hpp:184] Layer's types are Ftype:FLOAT Btype:FLOAT Fmath:FLOAT Bmath:FLOAT
I0406 16:41:24.520851   112 internal_thread.cpp:19] Starting 1 internal thread(s) on device 0
I0406 16:41:24.559329   112 net.cpp:187] Created Layer train-data (0)
I0406 16:41:24.559688   164 blocking_queue.cpp:40] Data layer prefetch queue empty
I0406 16:41:24.559703   112 net.cpp:529] train-data -> data
I0406 16:41:24.559777   112 net.cpp:529] train-data -> label
I0406 16:41:24.560501   112 data_reader.cpp:55] Sample Data Reader threads: 1, out queues: 1, depth: 256
I0406 16:41:24.560678   112 internal_thread.cpp:19] Starting 1 internal thread(s) on device 0
I0406 16:41:24.561302   165 db_lmdb.cpp:36] Opened lmdb /jobs/20220128-163550-e2c3/train_db
I0406 16:41:24.566056   112 data_layer.cpp:197] [0] Output data size: 256, 3, 64, 64
I0406 16:41:24.566120   112 internal_thread.cpp:19] Starting 1 internal thread(s) on device 0
I0406 16:41:24.566231   112 net.cpp:247] Setting up train-data
I0406 16:41:24.566258   112 net.cpp:254] TRAIN Top shape for layer 0 'train-data' 256 3 64 64 (3145728)
I0406 16:41:24.566278   112 net.cpp:254] TRAIN Top shape for layer 0 'train-data' 256 (256)
I0406 16:41:24.566295   112 layer_factory.hpp:172] Creating layer 'data-augment' of type 'Python'
I0406 16:41:24.566309   112 layer_factory.hpp:184] Layer's types are Ftype:FLOAT Btype:FLOAT Fmath:FLOAT Bmath:FLOAT
I0406 16:41:24.567971   112 layer_factory.cpp:339] Importing Python module 'digits_python_layers'
I0406 16:41:24.571938   166 common.cpp:525] NVML initialized, thread 166
*** Aborted at 1649263285 (unix time) try "date -d @1649263285" if you are using GNU date ***
PC: @     0x7f667cbf458a (unknown)
*** SIGSEGV (@0x48) received by PID 112 (TID 0x7f667f5ebb80) from PID 72; stack trace: ***
@     0x7f667c22dcb0 (unknown)
@     0x7f667cbf458a (unknown)
@     0x7f667cc90359 (unknown)
@     0x7f667cb4edef (unknown)
@     0x7f667e36795c caffe::GetPythonLayer()
@     0x7f667e594ec1 caffe::LayerRegistry::CreateLayer()
@     0x7f667e5e9ab4 caffe::Net::Init()
@     0x7f667e5ec26f caffe::Net::Net()
@     0x7f667e5f6760 caffe::Solver::InitTrainNet()
@     0x7f667e5f6c25 caffe::Solver::Init()
@     0x7f667e5f709e caffe::Solver::Solver()
@     0x7f667e6016b6 caffe::Creator_AdamSolver()
@           0x4110a7 caffe::SolverRegistry::CreateSolver()
@           0x40bd0a train()
@           0x409870 main
@     0x7f667c218f45 (unknown)
@           0x40a1ab (unknown)
@                0x0 (unknown)

What's happening?