For me, it tooks about 10s to execute
import mxnet as mx
in my python 3.6.6. The slow import procedure may due to the large mxnet.dll file, which is generated by
KNOWN_CUDA_ARCHS := 30 35 50 52 60 61 70 75
# Run nvcc on a zero-length file to check architecture-level support.
# Create args to include SASS in the fat binary for supported levels.
CUDA_ARCH := $(foreach arch,$(KNOWN_CUDA_ARCHS), \
$(shell $(NVCC) -arch=sm_$(arch) -E --x cu /dev/null >/dev/null 2>&1 && \
echo -gencode arch=compute_$(arch),code=sm_$(arch)))
# Convert a trailing "code=sm_NN" to "code=[sm_NN,compute_NN]" to also
# include the PTX of the most recent arch in the fat-binaries for
# forward compatibility with newer GPUs.
CUDA_ARCH := $(shell echo $(CUDA_ARCH) | sed 's/sm_\([0-9]*\)$$/[sm_\1,compute_\1]/')
# Add fat binary compression if supported by nvcc.
COMPRESS := --fatbin-options -compress-all
CUDA_ARCH += $(shell $(NVCC) -cuda $(COMPRESS) --x cu /dev/null -o /dev/null >/dev/null 2>&1 && \
echo $(COMPRESS))
These terrible instructions came up with a REALLY HUGE .dll file, mxnet.dll
Now, the question is, can we strip it for a specific arch, e.g., sm_61,compute_61
for my GTX 1060?