Still having AttributeError: 'NoneType' object has no attribute 'create_execution_context'

Hi
Sorry to bother you with this once more
I know there were a lot of posts regrading this error
But after spending a whole day, I still wasn’t able to get a good answer…

by running

python pytorch_model.py

will create an onnx file from pytorch model

then running

python build_engine.py

should give me a quantized (calibrated) model
but it seems like function “build_cuda_engine” doesn’t return anything…

I would appreciate you to have a look on the code

but in brief, this is how the code looks like

def build_engine(onnx_file_path, trt_logger, calibration_cache_path):
    builder = trt.Builder(trt_logger)
    network = builder.create_network()
    # last_layer = network.get_layer(network.num_layers - 1)
    # network.mark_output(last_layer.get_output(0))
    # network_creation_flag = 1 << int(trt.NetworkDefinitionCreaionFlag.EXPLICIT_BATCH)
    # network = builder.create_network(network_creation_flag)
    parser = trt.OnnxParser(network, trt_logger)
    builder.max_workspace_size = 1 << 30
    builder.max_batch_size = 1
    if builder.platform_has_fast_int8:
        builder.fp16_mode = True
        # builder.int8_mode = True

    if builder.int8_mode == True:
        NUM_IMAGES_PER_BATCH = 5

        calibration_files = create_calibration_dataset(args.calibration_path)

        batchstream = ImageBatchStream(NUM_IMAGES_PER_BATCH, calibration_files, preprocess_image)

        Int8_calibrator = PythonEntropyCalibrator(['input_node_name'], batchstream, calibration_cache_path)
        builder.int8_calibrator = Int8_calibrator

    with open(onnx_file_path, 'rb') as model:
        print('Beginning ONNX file parsing')
        parser.parse(model.read())
    print('Completed parsing of ONNX file')

    print("network.num_layers", network.num_layers)

    print('Building an engine...')
    engine = builder.build_cuda_engine(network)

    print(engine)
    context = engine.create_execution_context()
    print('Completed creating Engine')
    return engine, context

code.tar.gz (27.8 MB)

I have used docker from TensorRT | NVIDIA NGC
and only installed timm with “pip install timm” (which installs torch with 1.8.1 version, but I have also downgraded it to 1.5.0 and still won’t work)

Help much appreciated thanks!

Hi,
Request you to share the ONNX model and the script if not shared already so that we can assist you better.
Alongside you can try few things:

  1. validating your model with the below snippet

check_model.py

import sys
import onnx
filename = yourONNXmodel
model = onnx.load(filename)
onnx.checker.check_model(model).
2) Try running your model with trtexec command.
https://github.com/NVIDIA/TensorRT/tree/master/samples/opensource/trtexec
In case you are still facing issue, request you to share the trtexec “”–verbose"" log for further debugging
Thanks!

Hi
Thx for your rep
As you can see the code in my zip file i have already tried what you suggested (checking onnx file)

I also did include my onnx file in the tar file

I will appreciate if you have a look for me 😥🙏

This is how it looks like when I give verbose = True
thanks!

graph(%input : Float(1, 3, 240, 240, strides=[172800, 57600, 240, 1], requires_grad=0, device=cuda:0),
      %blocks.0.0.se.conv_reduce.weight : Float(8, 32, 1, 1, strides=[32, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.0.0.se.conv_reduce.bias : Float(8, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.0.0.se.conv_expand.weight : Float(32, 8, 1, 1, strides=[8, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.0.0.se.conv_expand.bias : Float(32, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.0.1.se.conv_reduce.weight : Float(4, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.0.1.se.conv_reduce.bias : Float(4, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.0.1.se.conv_expand.weight : Float(16, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.0.1.se.conv_expand.bias : Float(16, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.0.se.conv_reduce.weight : Float(4, 96, 1, 1, strides=[96, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.0.se.conv_reduce.bias : Float(4, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.0.se.conv_expand.weight : Float(96, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.0.se.conv_expand.bias : Float(96, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.1.se.conv_reduce.weight : Float(6, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.1.se.conv_reduce.bias : Float(6, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.1.se.conv_expand.weight : Float(144, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.1.se.conv_expand.bias : Float(144, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.2.se.conv_reduce.weight : Float(6, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.2.se.conv_reduce.bias : Float(6, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.1.2.se.conv_expand.weight : Float(144, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.1.2.se.conv_expand.bias : Float(144, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.0.se.conv_reduce.weight : Float(6, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.0.se.conv_reduce.bias : Float(6, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.0.se.conv_expand.weight : Float(144, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.0.se.conv_expand.bias : Float(144, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.1.se.conv_reduce.weight : Float(10, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.1.se.conv_reduce.bias : Float(10, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.1.se.conv_expand.weight : Float(240, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.1.se.conv_expand.bias : Float(240, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.2.se.conv_reduce.weight : Float(10, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.2.se.conv_reduce.bias : Float(10, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.2.2.se.conv_expand.weight : Float(240, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.2.2.se.conv_expand.bias : Float(240, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.0.se.conv_reduce.weight : Float(10, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.0.se.conv_reduce.bias : Float(10, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.0.se.conv_expand.weight : Float(240, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.0.se.conv_expand.bias : Float(240, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.1.se.conv_reduce.weight : Float(20, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.1.se.conv_reduce.bias : Float(20, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.1.se.conv_expand.weight : Float(480, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.1.se.conv_expand.bias : Float(480, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.2.se.conv_reduce.weight : Float(20, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.2.se.conv_reduce.bias : Float(20, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.2.se.conv_expand.weight : Float(480, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.2.se.conv_expand.bias : Float(480, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.3.se.conv_reduce.weight : Float(20, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.3.se.conv_reduce.bias : Float(20, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.3.3.se.conv_expand.weight : Float(480, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.3.3.se.conv_expand.bias : Float(480, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.0.se.conv_reduce.weight : Float(20, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.0.se.conv_reduce.bias : Float(20, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.0.se.conv_expand.weight : Float(480, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.0.se.conv_expand.bias : Float(480, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.1.se.conv_reduce.weight : Float(28, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.1.se.conv_reduce.bias : Float(28, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.1.se.conv_expand.weight : Float(672, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.1.se.conv_expand.bias : Float(672, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.2.se.conv_reduce.weight : Float(28, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.2.se.conv_reduce.bias : Float(28, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.2.se.conv_expand.weight : Float(672, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.2.se.conv_expand.bias : Float(672, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.3.se.conv_reduce.weight : Float(28, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.3.se.conv_reduce.bias : Float(28, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.4.3.se.conv_expand.weight : Float(672, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.4.3.se.conv_expand.bias : Float(672, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.0.se.conv_reduce.weight : Float(28, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.0.se.conv_reduce.bias : Float(28, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.0.se.conv_expand.weight : Float(672, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.0.se.conv_expand.bias : Float(672, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.1.se.conv_reduce.weight : Float(48, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.1.se.conv_reduce.bias : Float(48, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.1.se.conv_expand.weight : Float(1152, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.1.se.conv_expand.bias : Float(1152, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.2.se.conv_reduce.weight : Float(48, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.2.se.conv_reduce.bias : Float(48, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.2.se.conv_expand.weight : Float(1152, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.2.se.conv_expand.bias : Float(1152, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.3.se.conv_reduce.weight : Float(48, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.3.se.conv_reduce.bias : Float(48, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.3.se.conv_expand.weight : Float(1152, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.3.se.conv_expand.bias : Float(1152, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.4.se.conv_reduce.weight : Float(48, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.4.se.conv_reduce.bias : Float(48, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.5.4.se.conv_expand.weight : Float(1152, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.5.4.se.conv_expand.bias : Float(1152, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.6.0.se.conv_reduce.weight : Float(48, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.6.0.se.conv_reduce.bias : Float(48, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.6.0.se.conv_expand.weight : Float(1152, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.6.0.se.conv_expand.bias : Float(1152, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.6.1.se.conv_reduce.weight : Float(80, 1920, 1, 1, strides=[1920, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.6.1.se.conv_reduce.bias : Float(80, strides=[1], requires_grad=1, device=cuda:0),
      %blocks.6.1.se.conv_expand.weight : Float(1920, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=1, device=cuda:0),
      %blocks.6.1.se.conv_expand.bias : Float(1920, strides=[1], requires_grad=1, device=cuda:0),
      %classifier.weight : Float(1000, 1280, strides=[1280, 1], requires_grad=1, device=cuda:0),
      %classifier.bias : Float(1000, strides=[1], requires_grad=1, device=cuda:0),
      %920 : Float(32, 3, 3, 3, strides=[27, 9, 3, 1], requires_grad=0, device=cuda:0),
      %921 : Float(32, strides=[1], requires_grad=0, device=cuda:0),
      %923 : Float(32, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %924 : Float(32, strides=[1], requires_grad=0, device=cuda:0),
      %926 : Float(16, 32, 1, 1, strides=[32, 1, 1, 1], requires_grad=0, device=cuda:0),
      %927 : Float(16, strides=[1], requires_grad=0, device=cuda:0),
      %929 : Float(16, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %930 : Float(16, strides=[1], requires_grad=0, device=cuda:0),
      %932 : Float(16, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=0, device=cuda:0),
      %933 : Float(16, strides=[1], requires_grad=0, device=cuda:0),
      %935 : Float(96, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=0, device=cuda:0),
      %936 : Float(96, strides=[1], requires_grad=0, device=cuda:0),
      %938 : Float(96, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %939 : Float(96, strides=[1], requires_grad=0, device=cuda:0),
      %941 : Float(24, 96, 1, 1, strides=[96, 1, 1, 1], requires_grad=0, device=cuda:0),
      %942 : Float(24, strides=[1], requires_grad=0, device=cuda:0),
      %944 : Float(144, 24, 1, 1, strides=[24, 1, 1, 1], requires_grad=0, device=cuda:0),
      %945 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %947 : Float(144, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %948 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %950 : Float(24, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=0, device=cuda:0),
      %951 : Float(24, strides=[1], requires_grad=0, device=cuda:0),
      %953 : Float(144, 24, 1, 1, strides=[24, 1, 1, 1], requires_grad=0, device=cuda:0),
      %954 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %956 : Float(144, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %957 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %959 : Float(24, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=0, device=cuda:0),
      %960 : Float(24, strides=[1], requires_grad=0, device=cuda:0),
      %962 : Float(144, 24, 1, 1, strides=[24, 1, 1, 1], requires_grad=0, device=cuda:0),
      %963 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %965 : Float(144, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %966 : Float(144, strides=[1], requires_grad=0, device=cuda:0),
      %968 : Float(40, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=0, device=cuda:0),
      %969 : Float(40, strides=[1], requires_grad=0, device=cuda:0),
      %971 : Float(240, 40, 1, 1, strides=[40, 1, 1, 1], requires_grad=0, device=cuda:0),
      %972 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %974 : Float(240, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %975 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %977 : Float(40, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=0, device=cuda:0),
      %978 : Float(40, strides=[1], requires_grad=0, device=cuda:0),
      %980 : Float(240, 40, 1, 1, strides=[40, 1, 1, 1], requires_grad=0, device=cuda:0),
      %981 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %983 : Float(240, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %984 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %986 : Float(40, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=0, device=cuda:0),
      %987 : Float(40, strides=[1], requires_grad=0, device=cuda:0),
      %989 : Float(240, 40, 1, 1, strides=[40, 1, 1, 1], requires_grad=0, device=cuda:0),
      %990 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %992 : Float(240, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %993 : Float(240, strides=[1], requires_grad=0, device=cuda:0),
      %995 : Float(80, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=0, device=cuda:0),
      %996 : Float(80, strides=[1], requires_grad=0, device=cuda:0),
      %998 : Float(480, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=0, device=cuda:0),
      %999 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1001 : Float(480, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %1002 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1004 : Float(80, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1005 : Float(80, strides=[1], requires_grad=0, device=cuda:0),
      %1007 : Float(480, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1008 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1010 : Float(480, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %1011 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1013 : Float(80, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1014 : Float(80, strides=[1], requires_grad=0, device=cuda:0),
      %1016 : Float(480, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1017 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1019 : Float(480, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %1020 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1022 : Float(80, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1023 : Float(80, strides=[1], requires_grad=0, device=cuda:0),
      %1025 : Float(480, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1026 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1028 : Float(480, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1029 : Float(480, strides=[1], requires_grad=0, device=cuda:0),
      %1031 : Float(112, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1032 : Float(112, strides=[1], requires_grad=0, device=cuda:0),
      %1034 : Float(672, 112, 1, 1, strides=[112, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1035 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1037 : Float(672, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1038 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1040 : Float(112, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1041 : Float(112, strides=[1], requires_grad=0, device=cuda:0),
      %1043 : Float(672, 112, 1, 1, strides=[112, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1044 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1046 : Float(672, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1047 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1049 : Float(112, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1050 : Float(112, strides=[1], requires_grad=0, device=cuda:0),
      %1052 : Float(672, 112, 1, 1, strides=[112, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1053 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1055 : Float(672, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1056 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1058 : Float(112, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1059 : Float(112, strides=[1], requires_grad=0, device=cuda:0),
      %1061 : Float(672, 112, 1, 1, strides=[112, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1062 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1064 : Float(672, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1065 : Float(672, strides=[1], requires_grad=0, device=cuda:0),
      %1067 : Float(192, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1068 : Float(192, strides=[1], requires_grad=0, device=cuda:0),
      %1070 : Float(1152, 192, 1, 1, strides=[192, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1071 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1073 : Float(1152, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1074 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1076 : Float(192, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1077 : Float(192, strides=[1], requires_grad=0, device=cuda:0),
      %1079 : Float(1152, 192, 1, 1, strides=[192, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1080 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1082 : Float(1152, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1083 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1085 : Float(192, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1086 : Float(192, strides=[1], requires_grad=0, device=cuda:0),
      %1088 : Float(1152, 192, 1, 1, strides=[192, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1089 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1091 : Float(1152, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1092 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1094 : Float(192, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1095 : Float(192, strides=[1], requires_grad=0, device=cuda:0),
      %1097 : Float(1152, 192, 1, 1, strides=[192, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1098 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1100 : Float(1152, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=0, device=cuda:0),
      %1101 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1103 : Float(192, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1104 : Float(192, strides=[1], requires_grad=0, device=cuda:0),
      %1106 : Float(1152, 192, 1, 1, strides=[192, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1107 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1109 : Float(1152, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %1110 : Float(1152, strides=[1], requires_grad=0, device=cuda:0),
      %1112 : Float(320, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1113 : Float(320, strides=[1], requires_grad=0, device=cuda:0),
      %1115 : Float(1920, 320, 1, 1, strides=[320, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1116 : Float(1920, strides=[1], requires_grad=0, device=cuda:0),
      %1118 : Float(1920, 1, 3, 3, strides=[9, 9, 3, 1], requires_grad=0, device=cuda:0),
      %1119 : Float(1920, strides=[1], requires_grad=0, device=cuda:0),
      %1121 : Float(320, 1920, 1, 1, strides=[1920, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1122 : Float(320, strides=[1], requires_grad=0, device=cuda:0),
      %1124 : Float(1280, 320, 1, 1, strides=[320, 1, 1, 1], requires_grad=0, device=cuda:0),
      %1125 : Float(1280, strides=[1], requires_grad=0, device=cuda:0)):
  %919 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[2, 2]](%input, %920, %921)
  %511 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], device=cpu) = onnx::Sigmoid(%919)
  %512 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%919, %511) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %922 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=32, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%512, %923, %924)
  %515 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], device=cpu) = onnx::Sigmoid(%922)
  %516 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%922, %515) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %517 : Float(1, 32, 1, 1, strides=[32, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%516) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %518 : Float(1, 8, 1, 1, strides=[8, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%517, %blocks.0.0.se.conv_reduce.weight, %blocks.0.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %519 : Float(1, 8, 1, 1, strides=[8, 1, 1, 1], device=cpu) = onnx::Sigmoid(%518)
  %520 : Float(1, 8, 1, 1, strides=[8, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%518, %519) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %521 : Float(1, 32, 1, 1, strides=[32, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%520, %blocks.0.0.se.conv_expand.weight, %blocks.0.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %522 : Float(1, 32, 1, 1, strides=[32, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%521) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %523 : Float(1, 32, 120, 120, strides=[460800, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%516, %522) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %925 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%523, %926, %927)
  %928 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=16, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%925, %929, %930)
  %528 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], device=cpu) = onnx::Sigmoid(%928)
  %529 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%928, %528) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %530 : Float(1, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%529) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %531 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%530, %blocks.0.1.se.conv_reduce.weight, %blocks.0.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %532 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], device=cpu) = onnx::Sigmoid(%531)
  %533 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%531, %532) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %534 : Float(1, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%533, %blocks.0.1.se.conv_expand.weight, %blocks.0.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %535 : Float(1, 16, 1, 1, strides=[16, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%534) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %536 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%529, %535) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %931 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%536, %932, %933)
  %539 : Float(1, 16, 120, 120, strides=[230400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Add(%931, %925) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:133:0
  %934 : Float(1, 96, 120, 120, strides=[1382400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%539, %935, %936)
  %542 : Float(1, 96, 120, 120, strides=[1382400, 14400, 120, 1], device=cpu) = onnx::Sigmoid(%934)
  %543 : Float(1, 96, 120, 120, strides=[1382400, 14400, 120, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%934, %542) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %937 : Float(1, 96, 60, 60, strides=[345600, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=96, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[2, 2]](%543, %938, %939)
  %546 : Float(1, 96, 60, 60, strides=[345600, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%937)
  %547 : Float(1, 96, 60, 60, strides=[345600, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%937, %546) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %548 : Float(1, 96, 1, 1, strides=[96, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%547) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %549 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%548, %blocks.1.0.se.conv_reduce.weight, %blocks.1.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %550 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], device=cpu) = onnx::Sigmoid(%549)
  %551 : Float(1, 4, 1, 1, strides=[4, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%549, %550) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %552 : Float(1, 96, 1, 1, strides=[96, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%551, %blocks.1.0.se.conv_expand.weight, %blocks.1.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %553 : Float(1, 96, 1, 1, strides=[96, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%552) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %554 : Float(1, 96, 60, 60, strides=[345600, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%547, %553) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %940 : Float(1, 24, 60, 60, strides=[86400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%554, %941, %942)
  %943 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%940, %944, %945)
  %559 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%943)
  %560 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%943, %559) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %946 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=144, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%560, %947, %948)
  %563 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%946)
  %564 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%946, %563) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %565 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%564) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %566 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%565, %blocks.1.1.se.conv_reduce.weight, %blocks.1.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %567 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], device=cpu) = onnx::Sigmoid(%566)
  %568 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%566, %567) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %569 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%568, %blocks.1.1.se.conv_expand.weight, %blocks.1.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %570 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%569) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %571 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%564, %570) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %949 : Float(1, 24, 60, 60, strides=[86400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%571, %950, %951)
  %574 : Float(1, 24, 60, 60, strides=[86400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Add(%949, %940)
  %952 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%574, %953, %954)
  %577 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%952)
  %578 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%952, %577) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %955 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=144, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%578, %956, %957)
  %581 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%955)
  %582 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%955, %581) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %583 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%582) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %584 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%583, %blocks.1.2.se.conv_reduce.weight, %blocks.1.2.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %585 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], device=cpu) = onnx::Sigmoid(%584)
  %586 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%584, %585) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %587 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%586, %blocks.1.2.se.conv_expand.weight, %blocks.1.2.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %588 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%587) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %589 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%582, %588) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %958 : Float(1, 24, 60, 60, strides=[86400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%589, %959, %960)
  %592 : Float(1, 24, 60, 60, strides=[86400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Add(%958, %574) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %961 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%592, %962, %963)
  %595 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], device=cpu) = onnx::Sigmoid(%961)
  %596 : Float(1, 144, 60, 60, strides=[518400, 3600, 60, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%961, %595) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %964 : Float(1, 144, 30, 30, strides=[129600, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=144, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[2, 2]](%596, %965, %966)
  %599 : Float(1, 144, 30, 30, strides=[129600, 900, 30, 1], device=cpu) = onnx::Sigmoid(%964)
  %600 : Float(1, 144, 30, 30, strides=[129600, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%964, %599) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %601 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%600) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %602 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%601, %blocks.2.0.se.conv_reduce.weight, %blocks.2.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %603 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], device=cpu) = onnx::Sigmoid(%602)
  %604 : Float(1, 6, 1, 1, strides=[6, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%602, %603) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %605 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%604, %blocks.2.0.se.conv_expand.weight, %blocks.2.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %606 : Float(1, 144, 1, 1, strides=[144, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%605) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %607 : Float(1, 144, 30, 30, strides=[129600, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%600, %606) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %967 : Float(1, 40, 30, 30, strides=[36000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%607, %968, %969)
  %970 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%967, %971, %972)
  %612 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], device=cpu) = onnx::Sigmoid(%970)
  %613 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%970, %612) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %973 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=240, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%613, %974, %975)
  %616 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], device=cpu) = onnx::Sigmoid(%973)
  %617 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%973, %616) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %618 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%617) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %619 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%618, %blocks.2.1.se.conv_reduce.weight, %blocks.2.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %620 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], device=cpu) = onnx::Sigmoid(%619)
  %621 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%619, %620) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %622 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%621, %blocks.2.1.se.conv_expand.weight, %blocks.2.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %623 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%622) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %624 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%617, %623) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %976 : Float(1, 40, 30, 30, strides=[36000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%624, %977, %978)
  %627 : Float(1, 40, 30, 30, strides=[36000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Add(%976, %967)
  %979 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%627, %980, %981)
  %630 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], device=cpu) = onnx::Sigmoid(%979)
  %631 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%979, %630) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %982 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=240, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%631, %983, %984)
  %634 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], device=cpu) = onnx::Sigmoid(%982)
  %635 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%982, %634) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %636 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%635) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %637 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%636, %blocks.2.2.se.conv_reduce.weight, %blocks.2.2.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %638 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], device=cpu) = onnx::Sigmoid(%637)
  %639 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%637, %638) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %640 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%639, %blocks.2.2.se.conv_expand.weight, %blocks.2.2.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %641 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%640) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %642 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%635, %641) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %985 : Float(1, 40, 30, 30, strides=[36000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%642, %986, %987)
  %645 : Float(1, 40, 30, 30, strides=[36000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Add(%985, %627) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %988 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%645, %989, %990)
  %648 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], device=cpu) = onnx::Sigmoid(%988)
  %649 : Float(1, 240, 30, 30, strides=[216000, 900, 30, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%988, %648) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %991 : Float(1, 240, 15, 15, strides=[54000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=240, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[2, 2]](%649, %992, %993)
  %652 : Float(1, 240, 15, 15, strides=[54000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%991)
  %653 : Float(1, 240, 15, 15, strides=[54000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%991, %652) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %654 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%653) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %655 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%654, %blocks.3.0.se.conv_reduce.weight, %blocks.3.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %656 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], device=cpu) = onnx::Sigmoid(%655)
  %657 : Float(1, 10, 1, 1, strides=[10, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%655, %656) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %658 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%657, %blocks.3.0.se.conv_expand.weight, %blocks.3.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %659 : Float(1, 240, 1, 1, strides=[240, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%658) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %660 : Float(1, 240, 15, 15, strides=[54000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%653, %659) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %994 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%660, %995, %996)
  %997 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%994, %998, %999)
  %665 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%997)
  %666 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%997, %665) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1000 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=480, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%666, %1001, %1002)
  %669 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1000)
  %670 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1000, %669) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %671 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%670) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %672 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%671, %blocks.3.1.se.conv_reduce.weight, %blocks.3.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %673 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], device=cpu) = onnx::Sigmoid(%672)
  %674 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%672, %673) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %675 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%674, %blocks.3.1.se.conv_expand.weight, %blocks.3.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %676 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%675) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %677 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%670, %676) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1003 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%677, %1004, %1005)
  %680 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1003, %994)
  %1006 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%680, %1007, %1008)
  %683 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1006)
  %684 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1006, %683) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1009 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=480, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%684, %1010, %1011)
  %687 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1009)
  %688 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1009, %687) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %689 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%688) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %690 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%689, %blocks.3.2.se.conv_reduce.weight, %blocks.3.2.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %691 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], device=cpu) = onnx::Sigmoid(%690)
  %692 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%690, %691) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %693 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%692, %blocks.3.2.se.conv_expand.weight, %blocks.3.2.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %694 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%693) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %695 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%688, %694) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1012 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%695, %1013, %1014)
  %698 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1012, %680)
  %1015 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%698, %1016, %1017)
  %701 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1015)
  %702 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1015, %701) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1018 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=480, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%702, %1019, %1020)
  %705 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1018)
  %706 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1018, %705) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %707 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%706) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %708 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%707, %blocks.3.3.se.conv_reduce.weight, %blocks.3.3.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %709 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], device=cpu) = onnx::Sigmoid(%708)
  %710 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%708, %709) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %711 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%710, %blocks.3.3.se.conv_expand.weight, %blocks.3.3.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %712 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%711) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %713 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%706, %712) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1021 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%713, %1022, %1023)
  %716 : Float(1, 80, 15, 15, strides=[18000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1021, %698) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %1024 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%716, %1025, %1026)
  %719 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1024)
  %720 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1024, %719) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1027 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=480, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%720, %1028, %1029)
  %723 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1027)
  %724 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1027, %723) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %725 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%724) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %726 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%725, %blocks.4.0.se.conv_reduce.weight, %blocks.4.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %727 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], device=cpu) = onnx::Sigmoid(%726)
  %728 : Float(1, 20, 1, 1, strides=[20, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%726, %727) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %729 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%728, %blocks.4.0.se.conv_expand.weight, %blocks.4.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %730 : Float(1, 480, 1, 1, strides=[480, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%729) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %731 : Float(1, 480, 15, 15, strides=[108000, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%724, %730) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1030 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%731, %1031, %1032)
  %1033 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%1030, %1034, %1035)
  %736 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1033)
  %737 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1033, %736) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1036 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=672, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%737, %1037, %1038)
  %740 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1036)
  %741 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1036, %740) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %742 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%741) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %743 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%742, %blocks.4.1.se.conv_reduce.weight, %blocks.4.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %744 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], device=cpu) = onnx::Sigmoid(%743)
  %745 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%743, %744) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %746 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%745, %blocks.4.1.se.conv_expand.weight, %blocks.4.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %747 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%746) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %748 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%741, %747) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1039 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%748, %1040, %1041)
  %751 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1039, %1030)
  %1042 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%751, %1043, %1044)
  %754 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1042)
  %755 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1042, %754) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1045 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=672, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%755, %1046, %1047)
  %758 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1045)
  %759 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1045, %758) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %760 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%759) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %761 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%760, %blocks.4.2.se.conv_reduce.weight, %blocks.4.2.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %762 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], device=cpu) = onnx::Sigmoid(%761)
  %763 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%761, %762) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %764 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%763, %blocks.4.2.se.conv_expand.weight, %blocks.4.2.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %765 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%764) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %766 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%759, %765) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1048 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%766, %1049, %1050)
  %769 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1048, %751)
  %1051 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%769, %1052, %1053)
  %772 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1051)
  %773 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1051, %772) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1054 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=672, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%773, %1055, %1056)
  %776 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1054)
  %777 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1054, %776) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %778 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%777) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %779 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%778, %blocks.4.3.se.conv_reduce.weight, %blocks.4.3.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %780 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], device=cpu) = onnx::Sigmoid(%779)
  %781 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%779, %780) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %782 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%781, %blocks.4.3.se.conv_expand.weight, %blocks.4.3.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %783 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%782) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %784 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%777, %783) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1057 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%784, %1058, %1059)
  %787 : Float(1, 112, 15, 15, strides=[25200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1057, %769) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %1060 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%787, %1061, %1062)
  %790 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], device=cpu) = onnx::Sigmoid(%1060)
  %791 : Float(1, 672, 15, 15, strides=[151200, 225, 15, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1060, %790) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1063 : Float(1, 672, 8, 8, strides=[43008, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=672, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[2, 2]](%791, %1064, %1065)
  %794 : Float(1, 672, 8, 8, strides=[43008, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1063)
  %795 : Float(1, 672, 8, 8, strides=[43008, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1063, %794) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %796 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%795) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %797 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%796, %blocks.5.0.se.conv_reduce.weight, %blocks.5.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %798 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], device=cpu) = onnx::Sigmoid(%797)
  %799 : Float(1, 28, 1, 1, strides=[28, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%797, %798) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %800 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%799, %blocks.5.0.se.conv_expand.weight, %blocks.5.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %801 : Float(1, 672, 1, 1, strides=[672, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%800) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %802 : Float(1, 672, 8, 8, strides=[43008, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%795, %801) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1066 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%802, %1067, %1068)
  %1069 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%1066, %1070, %1071)
  %807 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1069)
  %808 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1069, %807) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1072 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1152, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%808, %1073, %1074)
  %811 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1072)
  %812 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1072, %811) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %813 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%812) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %814 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%813, %blocks.5.1.se.conv_reduce.weight, %blocks.5.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %815 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], device=cpu) = onnx::Sigmoid(%814)
  %816 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%814, %815) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %817 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%816, %blocks.5.1.se.conv_expand.weight, %blocks.5.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %818 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%817) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %819 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%812, %818) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1075 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%819, %1076, %1077)
  %822 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1075, %1066)
  %1078 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%822, %1079, %1080)
  %825 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1078)
  %826 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1078, %825) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1081 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1152, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%826, %1082, %1083)
  %829 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1081)
  %830 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1081, %829) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %831 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%830) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %832 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%831, %blocks.5.2.se.conv_reduce.weight, %blocks.5.2.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %833 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], device=cpu) = onnx::Sigmoid(%832)
  %834 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%832, %833) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %835 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%834, %blocks.5.2.se.conv_expand.weight, %blocks.5.2.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %836 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%835) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %837 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%830, %836) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1084 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%837, %1085, %1086)
  %840 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1084, %822)
  %1087 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%840, %1088, %1089)
  %843 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1087)
  %844 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1087, %843) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1090 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1152, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%844, %1091, %1092)
  %847 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1090)
  %848 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1090, %847) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %849 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%848) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %850 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%849, %blocks.5.3.se.conv_reduce.weight, %blocks.5.3.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %851 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], device=cpu) = onnx::Sigmoid(%850)
  %852 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%850, %851) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %853 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%852, %blocks.5.3.se.conv_expand.weight, %blocks.5.3.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %854 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%853) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %855 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%848, %854) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1093 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%855, %1094, %1095)
  %858 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1093, %840)
  %1096 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%858, %1097, %1098)
  %861 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1096)
  %862 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1096, %861) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1099 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1152, kernel_shape=[5, 5], pads=[2, 2, 2, 2], strides=[1, 1]](%862, %1100, %1101)
  %865 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1099)
  %866 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1099, %865) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %867 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%866) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %868 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%867, %blocks.5.4.se.conv_reduce.weight, %blocks.5.4.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %869 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], device=cpu) = onnx::Sigmoid(%868)
  %870 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%868, %869) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %871 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%870, %blocks.5.4.se.conv_expand.weight, %blocks.5.4.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %872 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%871) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %873 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%866, %872) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1102 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%873, %1103, %1104)
  %876 : Float(1, 192, 8, 8, strides=[12288, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1102, %858) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %1105 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%876, %1106, %1107)
  %879 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1105)
  %880 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1105, %879) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1108 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1152, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%880, %1109, %1110)
  %883 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1108)
  %884 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1108, %883) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %885 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%884) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %886 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%885, %blocks.6.0.se.conv_reduce.weight, %blocks.6.0.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %887 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], device=cpu) = onnx::Sigmoid(%886)
  %888 : Float(1, 48, 1, 1, strides=[48, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%886, %887) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %889 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%888, %blocks.6.0.se.conv_expand.weight, %blocks.6.0.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %890 : Float(1, 1152, 1, 1, strides=[1152, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%889) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %891 : Float(1, 1152, 8, 8, strides=[73728, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%884, %890) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1111 : Float(1, 320, 8, 8, strides=[20480, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%891, %1112, %1113)
  %1114 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%1111, %1115, %1116)
  %896 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1114)
  %897 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1114, %896) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %1117 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1920, kernel_shape=[3, 3], pads=[1, 1, 1, 1], strides=[1, 1]](%897, %1118, %1119)
  %900 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1117)
  %901 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1117, %900) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %902 : Float(1, 1920, 1, 1, strides=[1920, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::ReduceMean[axes=[2, 3], keepdims=1](%901) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:44:0
  %903 : Float(1, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%902, %blocks.6.1.se.conv_reduce.weight, %blocks.6.1.se.conv_reduce.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %904 : Float(1, 80, 1, 1, strides=[80, 1, 1, 1], device=cpu) = onnx::Sigmoid(%903)
  %905 : Float(1, 80, 1, 1, strides=[80, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%903, %904) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %906 : Float(1, 1920, 1, 1, strides=[1920, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%905, %blocks.6.1.se.conv_expand.weight, %blocks.6.1.se.conv_expand.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:395:0
  %907 : Float(1, 1920, 1, 1, strides=[1920, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::Sigmoid(%906) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/activations.py:47:0
  %908 : Float(1, 1920, 8, 8, strides=[122880, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%901, %907) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:48:0
  %1120 : Float(1, 320, 8, 8, strides=[20480, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%908, %1121, %1122)
  %911 : Float(1, 320, 8, 8, strides=[20480, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Add(%1120, %1111) # /usr/local/lib/python3.8/dist-packages/timm/models/efficientnet_blocks.py:208:0
  %1123 : Float(1, 1280, 8, 8, strides=[81920, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Conv[dilations=[1, 1], group=1, kernel_shape=[1, 1], pads=[0, 0, 0, 0], strides=[1, 1]](%911, %1124, %1125)
  %914 : Float(1, 1280, 8, 8, strides=[81920, 64, 8, 1], device=cpu) = onnx::Sigmoid(%1123)
  %915 : Float(1, 1280, 8, 8, strides=[81920, 64, 8, 1], requires_grad=1, device=cuda:0) = onnx::Mul(%1123, %914) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1795:0
  %916 : Float(1, 1280, 1, 1, strides=[1280, 1, 1, 1], requires_grad=1, device=cuda:0) = onnx::GlobalAveragePool(%915) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1037:0
  %917 : Float(1, 1280, strides=[1280, 1], requires_grad=1, device=cuda:0) = onnx::Flatten[axis=1](%916) # /usr/local/lib/python3.8/dist-packages/timm/models/layers/adaptive_avgmax_pool.py:109:0
  %output : Float(1, 1000, strides=[1000, 1], requires_grad=1, device=cuda:0) = onnx::Gemm[alpha=1., beta=1., transB=1](%917, %classifier.weight, %classifier.bias) # /usr/local/lib/python3.8/dist-packages/torch/nn/functional.py:1753:0
  return (%output)

Duplicate thread