From fad4a10c842a23af2fcc02518e604ebda94f1dcf Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Tue, 7 Apr 2020 15:55:10 -0700 Subject: [PATCH 01/13] feat(//lowering): centralize lowering and try to use PyTorch Conv2DBN folding before using the converter Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/compiler.cpp | 52 +++++++++++++------------------------- core/lowering/lowering.cpp | 28 ++++++++++++++++++-- core/lowering/lowering.h | 5 +++- 3 files changed, 48 insertions(+), 37 deletions(-) diff --git a/core/compiler.cpp b/core/compiler.cpp index 33e2f04bff..459efd1356 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -24,24 +24,24 @@ namespace trtorch { namespace core { -c10::FunctionSchema GenerateGraphSchema(torch::jit::script::Module mod, std::string method_name, std::shared_ptr& g) { +c10::FunctionSchema GenerateGraphSchema(torch::jit::script::Module mod, std::string method_name, std::shared_ptr& g) { std::vector args; for (auto in : g->inputs()) { args.push_back(c10::Argument(in->debugName(), in->type())); } - + std::vector returns; for (auto out : g->outputs()) { returns.push_back(c10::Argument(out->debugName(), out->type())); } - + return c10::FunctionSchema(method_name, method_name, args, returns); } void AddEngineToGraph(torch::jit::script::Module mod, std::shared_ptr& g, std::string& serialized_engine) { - execution::EngineID uid = execution::RegisterEngineFromSerializedEngine(serialized_engine); + execution::EngineID uid = execution::RegisterEngineFromSerializedEngine(serialized_engine); auto schema = execution::GetEngineFunctionSchema(uid); auto num_io = execution::GetEngineIO(uid); @@ -53,58 +53,42 @@ void AddEngineToGraph(torch::jit::script::Module mod, std::shared_ptrsetType(c10::TensorType::get()); graph_inputs.push_back(in_val); } - + auto engine_node = g->create(c10::Symbol::fromQualString(schema.name()), torch::jit::ArrayRef(graph_inputs), num_io.second); g->block()->appendNode(engine_node); for (auto o : engine_node->outputs()) { g->registerOutput(o); } - + return; } bool CheckMethodOperatorSupport(const torch::jit::script::Module& mod, std::string method_name) { - auto g = mod.get_method(method_name).graph(); - // Go through PyTorch Lowering to simplify graph and extract weight parameters - auto graph_and_parameters = torch::jit::LowerGraph(*g, mod._ivalue()); - - g = graph_and_parameters.first; - - // Go through TRTorch Lowering to reformat graph to be conversion friendly - // and also segment for accelerators and executors (TRT-DLA, TRT-GPU, PYT) - lowering::LowerGraph(g); - + // Go through Lowering to simplify graph and extract weight parameters + auto graph_and_parameters = lowering::Lower(mod, method_name); + + auto g = graph_and_parameters.first; auto params = graph_and_parameters.second; auto named_params = conversion::get_named_params(g->inputs(), params); LOG_DEBUG(*g << "(CheckMethodOperatorSupport)\n"); - - // Is this necessary? 
- lowering::LowerBlock(g->block()); - + return conversion::VerifyConverterSupportForBlock(g->block()); } std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::string method_name, conversion::ExtraInfo cfg) { - auto g = mod.get_method(method_name).graph(); - // Go through PyTorch Lowering to simplify graph and extract weight parameters - auto graph_and_parameters = torch::jit::LowerGraph(*g, mod._ivalue()); - - g = graph_and_parameters.first; - - // Go through TRTorch Lowering to reformat graph to be conversion friendly - // and also segment for accelerators and executors (TRT-DLA, TRT-GPU, PYT) - lowering::LowerGraph(g); - + // Go through Lowering to simplify graph and extract weight parameters + auto graph_and_parameters = lowering::Lower(mod, method_name); + + auto g = graph_and_parameters.first; auto params = graph_and_parameters.second; auto named_params = conversion::get_named_params(g->inputs(), params); + LOG_INFO(*g << "(CompileGraph)\n"); - - // Is this necessary? - lowering::LowerBlock(g->block()); + auto engine = ConvertBlockToEngine(g->block(), cfg, named_params); return std::move(engine); } @@ -128,7 +112,7 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, return new_mod; } - + } // namespace core } // namespace trtorch diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index f7f1254e00..d18a9d612d 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -1,5 +1,7 @@ -#include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/dead_code_elimination.h" +#include "torch/csrc/jit/passes/fuse_linear.h" +#include "torch/csrc/jit/passes/lower_graph.h" +#include "torch/csrc/jit/passes/quantization.h" #include "core/lowering/lowering.h" #include "core/lowering/irfusers/irfusers.h" @@ -22,7 +24,29 @@ void LowerGraph(std::shared_ptr& g) { //irfusers::UnpackBatchNorm(g); //torch::jit::EliminateDeadCode(g); } - + +void LowerModule(const torch::jit::script::Module& mod) { + torch::jit::FoldConvBatchNorm2d(mod); +} + +std::pair, std::vector> Lower(const torch::jit::script::Module& mod, + std::string method_name) { + LowerModule(mod); + auto g = mod.get_method(method_name).graph(); + // Go through PyTorch Lowering to simplify graph and extract weight parameters + auto graph_and_parameters = torch::jit::LowerGraph(*g, mod._ivalue()); + + g = graph_and_parameters.first; + + // Go through TRTorch Lowering to reformat graph to be conversion friendly + // and also segment for accelerators and executors (TRT-DLA, TRT-GPU, PYT) + lowering::LowerGraph(g); + // Is this necessary? 
+ lowering::LowerBlock(g->block()); + return graph_and_parameters; +} + + } // namespace lowering } // namespace core } // namespace trtorch diff --git a/core/lowering/lowering.h b/core/lowering/lowering.h index ed34ee30a0..95547c778c 100644 --- a/core/lowering/lowering.h +++ b/core/lowering/lowering.h @@ -5,9 +5,12 @@ namespace trtorch { namespace core { namespace lowering { - + void LowerBlock(torch::jit::Block* b); void LowerGraph(std::shared_ptr& g); +void LowerModule(const torch::jit::script::Module& mod); +std::pair, std::vector> Lower(const torch::jit::script::Module& mod, + std::string method_name); } // namespace lowering } // namespace core From a51c7b6abce5d1db0cd6bd8dd404dfef4b32efda Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Tue, 28 Apr 2020 18:05:18 -0700 Subject: [PATCH 02/13] refactor: Upgrading to LibTorch 1.5.0 (CUDA 10.2, cuDNN 7.6.5, TensorRT 7.0.0) - Closes #42 - Issue #1 is back, unknown root cause, will follow up with the PyTorch Team - Closes #14: The default build now requires users to grab the tarballs from the NVIDIA website to support hermetic builds, may look at some methods to smooth this out later. The old method is still available - New operators need to be implemented to support MobileNet in 1.5.0 (blocks merge into master) Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- .gitignore | 1 + README.md | 81 +++++++-- WORKSPACE | 54 ++++-- core/compiler.cpp | 21 ++- core/compiler.h | 2 +- core/conversion/conversion.cpp | 4 +- core/conversion/conversion.h | 2 +- core/conversion/conversion_blacklist.cpp | 4 +- core/conversion/conversionctx/ConversionCtx.h | 2 +- .../converters/NodeConverterRegistry.cpp | 2 +- core/conversion/converters/converters.h | 2 +- .../evaluators/NodeEvaluatorRegistry.cpp | 6 +- core/conversion/evaluators/evaluators.h | 4 +- core/conversion/evaluators/prim.cpp | 4 +- core/execution/BUILD | 3 +- core/execution/TRTEngine.cpp | 50 +----- core/execution/TRTEngineManager.cpp | 33 ++-- core/execution/execution.h | 2 - core/execution/register_trt_op.cpp | 67 ++++--- core/lowering/drop_unused_nodes.cpp | 6 +- core/lowering/irfusers/irfusers.h | 4 +- core/lowering/irfusers/unpack_batch_norm.cpp | 16 +- core/lowering/lowering.h | 4 +- core/util/jit_util.h | 2 +- cpp/api/include/trtorch/trtorch.h | 8 +- cpp/api/src/extra_info.cpp | 2 +- cpp/api/src/trtorch.cpp | 2 +- tests/core/converters/test_activation.cpp | 10 +- tests/core/converters/test_conv.cpp | 12 +- tests/core/converters/test_element_wise.cpp | 4 +- tests/core/converters/test_linear.cpp | 6 +- tests/core/converters/test_pooling.cpp | 6 +- tests/core/converters/test_reduce.cpp | 4 +- tests/core/converters/test_softmax.cpp | 8 +- tests/core/converters/test_unary.cpp | 4 +- tests/modules/hub.py | 6 +- tests/modules/module_test.h | 3 + tests/util/run_forward.cpp | 4 +- tests/util/run_graph.cpp | 8 +- tests/util/run_graph_engine.cpp | 4 +- tests/util/util.h | 4 +- third_party/BUILD | 0 third_party/cuda/BUILD | 2 +- third_party/cudnn/{ => archive}/BUILD | 6 +- third_party/cudnn/local/BUILD | 25 +++ third_party/libtorch/BUILD | 21 ++- third_party/tensorrt/archive/BUILD | 167 ++++++++++++++++++ third_party/tensorrt/{ => local}/BUILD | 0 48 files changed, 459 insertions(+), 233 deletions(-) create mode 100644 third_party/BUILD rename third_party/cudnn/{ => archive}/BUILD (82%) create mode 100644 third_party/cudnn/local/BUILD create mode 100644 third_party/tensorrt/archive/BUILD rename third_party/tensorrt/{ => local}/BUILD (100%) diff --git a/.gitignore b/.gitignore index 
2fc1b55b80..c757ba7d1e 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ cpp/ptq/datasets/data/ tests/accuracy/datasets/data/* ._.DS_Store *.tar.gz +*.tgz \ No newline at end of file diff --git a/README.md b/README.md index fbb7fff688..599c887daa 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > Ahead of Time (AOT) compiling for PyTorch JIT -TRTorch is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, TRTorch is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting a TensorRT engine. TRTorch operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/F16) and other settings for your module. +TRTorch is a compiler for PyTorch/TorchScript, targeting NVIDIA GPUs via NVIDIA's TensorRT Deep Learning Optimizer and Runtime. Unlike PyTorch's Just-In-Time (JIT) compiler, TRTorch is an Ahead-of-Time (AOT) compiler, meaning that before you deploy your TorchScript code, you go through an explicit compile step to convert a standard TorchScript program into an module targeting a TensorRT engine. TRTorch operates as a PyTorch extention and compiles modules that integrate into the JIT runtime seamlessly. After compilation using the optimized graph should feel no different than running a TorchScript module. You also have access to TensorRT's suite of configurations at compile time, so you are able to specify operating precision (FP32/F16/INT8) and other settings for your module. More Information / System Architecture: @@ -35,17 +35,17 @@ auto results = trt_mod.forward({in_tensor}); | Platform | Support | | -------- | ------- | | Linux AMD64 / GPU | **Supported** | -| Linux aarch64 / GPU | **Planned/Possible with Native Compiation and small modifications to the build system** | +| Linux aarch64 / GPU | **Planned/Possible with Native Compiation but untested** | | Linux aarch64 / DLA | **Planned/Possible with Native Compilation but untested** | | Windows / GPU | - | | Linux ppc64le / GPU | - | ### Dependencies -- Libtorch 1.4.0 -- CUDA 10.1 -- cuDNN 7.6 -- TensorRT 6.0.1 +- Libtorch 1.5.0 +- CUDA 10.2 +- cuDNN 7.6.5 +- TensorRT 7.0.0 ## Prebuilt Binaries @@ -53,10 +53,71 @@ Releases: https://github.com/NVIDIA/TRTorch/releases ## Compiling TRTorch -Install TensorRT, CUDA and cuDNN on the system before starting to compile. +### Installing Dependencies +You need to start by having CUDA installed on the system, Libtorch will automatically be pulled for you by bazel, +then you have two options. + +#### 1. Building using cuDNN & TensorRT tarball distributions + +> This is recommended so as to build TRTorch hermetically and insures any bugs are not caused by version issues + +> Make sure when running TRTorch that these versions of the libraries are prioritized in your `$LD_LIBRARY_PATH` + +1. You need to download the tarball distributions of TensorRT and cuDNN from the NVIDIA website. + - https://developer.nvidia.com/cudnn + - https://developer.nvidia.com/tensorrt +2. 
Place these files in a directory (the directories `thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu]` exist for this purpose) +3. Compile using: +``` shell +bazel build //:libtrtorch --compilation_mode opt --distdir thrid_party/distdir/[x86_64-linux-gnu | aarch64-linux-gnu] +``` + +#### 2. Building using locally installed cuDNN & TensorRT + +> If you find bugs and you compiled using this method please disclose it in the issue +> (an `ldd` dump would be nice too) + +1. Install TensorRT, CUDA and cuDNN on the system before starting to compile. +2. In `WORKSPACE` comment out +```py +# Downloaded distributions to use with --distdir +http_archive( + name = "cudnn", + urls = ["",], + + build_file = "@//third_party/cudnn/archive:BUILD", + sha256 = "", + strip_prefix = "cuda" +) + +http_archive( + name = "tensorrt", + urls = ["",], + + build_file = "@//third_party/tensorrt/archive:BUILD", + sha256 = "", + strip_prefix = "TensorRT-" +) +``` +and uncomment +```py +# Locally installed dependencies +new_local_repository( + name = "cudnn", + path = "/usr/", + build_file = "@//third_party/cudnn/local:BUILD" +) + +new_local_repository( + name = "tensorrt", + path = "/usr/", + build_file = "@//third_party/tensorrt/local:BUILD" +) +``` +3. Compile using: ``` shell -bazel build //:libtrtorch --compilation_mode=opt +bazel build //:libtrtorch --compilation_mode opt ``` ### Debug build @@ -84,9 +145,9 @@ Thanks for wanting to contribute! There are two main ways to handle supporting a ### In my application? -> The Node Converter Registry is not exposed in the top level API but you can try using the internal headers shipped with the tarball. +> The Node Converter Registry is not exposed in the top level API but in the internal headers shipped with the tarball. -You can register a converter for your op using the NodeConverterRegistry inside your application. +You can register a converter for your op using the `NodeConverterRegistry` inside your application. 
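For a rough sense of what that looks like, the sketch below imitates the in-tree converter pattern (compare the `aten::reshape` converter added in a later patch of this series). This is editorial illustration, not part of the patch: the include path, namespace qualifiers and the `aten::relu` schema string are assumptions, and the internal headers are not a stable public API.

``` c++
// Hypothetical sketch: registering an application-side converter for aten::relu.
// Mirrors the pattern used by the in-tree converters (see impl/shuffle.cpp in this series).
#include "core/conversion/converters/converters.h"  // internal header shipped with the tarball

namespace {
using namespace trtorch::core::conversion;             // assumed namespace layout
using namespace trtorch::core::conversion::converters;

// Constructing this static object adds the pattern to the NodeConverterRegistry at load time.
static auto my_converter_registrations = RegisterNodeConversionPatterns()
    .pattern({
        "aten::relu(Tensor self) -> (Tensor)",  // schema string is an assumption
        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
            auto in = args[0].ITensor();
            // ... build the corresponding TensorRT layers on ctx's network here ...
            return true;
        }
    });
} // anonymous namespace
```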
## Structure of the repo diff --git a/WORKSPACE b/WORKSPACE index 9714cccae4..f93c2f7e1c 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -16,15 +16,6 @@ py_repositories() load("@rules_python//python:pip.bzl", "pip_repositories", "pip_import") pip_repositories() -http_archive( - name = "libtorch", - build_file = "@//third_party/libtorch:BUILD", - strip_prefix = "libtorch", - urls = ["https://download.pytorch.org/libtorch/cu101/libtorch-cxx11-abi-shared-with-deps-1.4.0.zip"], - sha256 = "f214bfde532877aa5d4e0803e51a28fa8edd97b6a44b6615f75a70352b6b542e" -) - -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") http_archive( name = "rules_pkg", url = "https://github.com/bazelbuild/rules_pkg/releases/download/0.2.4/rules_pkg-0.2.4.tar.gz", @@ -34,24 +25,53 @@ http_archive( load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") rules_pkg_dependencies() +# CUDA should be installed on the system locally new_local_repository( name = "cuda", - path = "/usr/local/cuda-10.1/targets/x86_64-linux/", + path = "/usr/local/cuda-10.2/targets/x86_64-linux/", build_file = "@//third_party/cuda:BUILD", ) -new_local_repository( +http_archive( + name = "libtorch", + build_file = "@//third_party/libtorch:BUILD", + strip_prefix = "libtorch", + urls = ["https://download.pytorch.org/libtorch/cu102/libtorch-cxx11-abi-shared-with-deps-1.5.0.zip"], + sha256 = "0efdd4e709ab11088fa75f0501c19b0e294404231442bab1d1fb953924feb6b5" +) + +# Downloaded distributions to use with --distdir +http_archive( name = "cudnn", - path = "/usr/", - build_file = "@//third_party/cudnn:BUILD" + urls = ["https://developer.nvidia.com/compute/machine-learning/cudnn/secure/7.6.5.32/Production/10.2_20191118/cudnn-10.2-linux-x64-v7.6.5.32.tgz",], + + build_file = "@//third_party/cudnn/archive:BUILD", + sha256 = "600267f2caaed2fd58eb214ba669d8ea35f396a7d19b94822e6b36f9f7088c20", + strip_prefix = "cuda" ) -new_local_repository( - name = "tensorrt", - path = "/usr/", - build_file = "@//third_party/tensorrt:BUILD" +http_archive( + name = "tensorrt", + urls = ["https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/7.0/7.0.0.11/tars/TensorRT-7.0.0.11.Ubuntu-18.04.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz",], + + build_file = "@//third_party/tensorrt/archive:BUILD", + sha256 = "c7d73b2585b18aae68b740249efa8c8ba5ae852abe9a023720595432a8eb4efd", + strip_prefix = "TensorRT-7.0.0.11" ) +## Locally installed dependencies +# new_local_repository( +# name = "cudnn", +# path = "/usr/", +# build_file = "@//third_party/cudnn/local:BUILD" +#) + +# new_local_repository( +# name = "tensorrt", +# path = "/usr/", +# build_file = "@//third_party/tensorrt/local:BUILD" +#) + git_repository( name = "googletest", remote = "https://github.com/google/googletest", diff --git a/core/compiler.cpp b/core/compiler.cpp index 9e3a69033b..44d212a59c 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -7,12 +7,11 @@ #include "ATen/core/function_schema.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/pass_manager.h" +#include "torch/csrc/jit/frontend/function_schema_parser.h" +#include "torch/csrc/jit/ir/ir.h" +#include "torch/csrc/jit/passes/pass_manager.h" #include "torch/csrc/jit/passes/lower_graph.h" #include "torch/csrc/jit/passes/graph_fuser.h" -#include "torch/csrc/jit/script/module.h" -#include "torch/csrc/jit/script/function_schema_parser.h" #include "core/util/prelude.h" #include "core/compiler.h" @@ -42,25 +41,31 @@ c10::FunctionSchema GenerateGraphSchema(torch::jit::script::Module mod, std::str void 
AddEngineToGraph(torch::jit::script::Module mod, std::shared_ptr& g, std::string& serialized_engine) { execution::EngineID uid = execution::RegisterEngineFromSerializedEngine(serialized_engine); - auto schema = execution::GetEngineFunctionSchema(uid); auto num_io = execution::GetEngineIO(uid); auto self = g->addInput("self.1"); self->setType(mod.type()); - std::vector graph_inputs; + + auto id_val = g->insertConstant(uid); + + std::vector engine_inputs; + engine_inputs.push_back(id_val); + for (uint64_t i = 0; i < num_io.first; i++) { auto in_val = g->addInput(""); in_val->setType(c10::TensorType::get()); - graph_inputs.push_back(in_val); + engine_inputs.push_back(in_val); } - auto engine_node = g->create(c10::Symbol::fromQualString(schema.name()), torch::jit::ArrayRef(graph_inputs), num_io.second); + auto engine_node = g->create(c10::Symbol::fromQualString("trt::execute_engine"), torch::jit::ArrayRef(engine_inputs), num_io.second); g->block()->appendNode(engine_node); for (auto o : engine_node->outputs()) { g->registerOutput(o); } + LOG_DEBUG(*g << "(AddEngineToGraph)\n"); + return; } diff --git a/core/compiler.h b/core/compiler.h index a8b248d3a2..f9ff400159 100644 --- a/core/compiler.h +++ b/core/compiler.h @@ -1,7 +1,7 @@ #pragma once #include -#include "torch/csrc/jit/script/module.h" +#include "torch/csrc/jit/api/module.h" #include "core/conversion/conversion.h" namespace trtorch { diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 3d0dca2f9f..4b5fbf0fd2 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -14,9 +14,7 @@ namespace conversion { bool isNodeConversionBlacklisted(const torch::jit::Node* n); bool OpSupported(const torch::jit::Node* n) { - bool evalable = evaluators::shouldEvalAtConversionTime(n); - bool convertable = converters::node_is_convertable(n); - return evalable || convertable; + return evaluators::shouldEvalAtConversionTime(n) || converters::node_is_convertable(n); } c10::optional EvaluateNode(ConversionCtx* ctx, const torch::jit::Node* n, int level=0, int limit=10) { diff --git a/core/conversion/conversion.h b/core/conversion/conversion.h index c7a50a6319..f60e203f55 100644 --- a/core/conversion/conversion.h +++ b/core/conversion/conversion.h @@ -3,7 +3,7 @@ #include #include "NvInfer.h" -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" #include "core/conversion/conversionctx/ConversionCtx.h" namespace torch { diff --git a/core/conversion/conversion_blacklist.cpp b/core/conversion/conversion_blacklist.cpp index c20ccc7db7..9cdf5bd016 100644 --- a/core/conversion/conversion_blacklist.cpp +++ b/core/conversion/conversion_blacklist.cpp @@ -1,12 +1,12 @@ #include #include -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace trtorch { namespace core { namespace conversion { - + const std::unordered_set& get_non_convertable_nodes() { // Set of nodes that should not invoke a converter or evaluator static std::unordered_set nonconvertable_nodes = { diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h index b7922a319d..c06b816107 100644 --- a/core/conversion/conversionctx/ConversionCtx.h +++ b/core/conversion/conversionctx/ConversionCtx.h @@ -5,7 +5,7 @@ #include //#include "ATen/ATen.h" -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" #include "NvInfer.h" #include "core/util/prelude.h" diff --git a/core/conversion/converters/NodeConverterRegistry.cpp 
b/core/conversion/converters/NodeConverterRegistry.cpp index ff175c6023..04bddad002 100644 --- a/core/conversion/converters/NodeConverterRegistry.cpp +++ b/core/conversion/converters/NodeConverterRegistry.cpp @@ -1,6 +1,6 @@ #include "core/util/prelude.h" #include "core/conversion/converters/converters.h" -#include "torch/csrc/jit/script/function_schema_parser.h" +#include "torch/csrc/jit/frontend/function_schema_parser.h" namespace trtorch { namespace core { diff --git a/core/conversion/converters/converters.h b/core/conversion/converters/converters.h index 1b6f3b916f..465c952ceb 100644 --- a/core/conversion/converters/converters.h +++ b/core/conversion/converters/converters.h @@ -3,7 +3,7 @@ #include #include -#include "torch/csrc/jit/custom_operator.h" +#include "torch/csrc/jit/runtime/custom_operator.h" #include "ATen/core/function_schema.h" #include "core/conversion/conversionctx/ConversionCtx.h" diff --git a/core/conversion/evaluators/NodeEvaluatorRegistry.cpp b/core/conversion/evaluators/NodeEvaluatorRegistry.cpp index a810c44584..6f387326fd 100644 --- a/core/conversion/evaluators/NodeEvaluatorRegistry.cpp +++ b/core/conversion/evaluators/NodeEvaluatorRegistry.cpp @@ -1,7 +1,7 @@ #include -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/ir/ir.h" +#include "torch/csrc/jit/ir/constants.h" #include "ATen/core/functional.h" #include "ATen/core/ivalue.h" #include "ATen/core/List.h" @@ -41,7 +41,7 @@ class NodeEvaluatorRegistry { return true; } } - + private: EvaluatorLUT evaluator_lut_; }; diff --git a/core/conversion/evaluators/evaluators.h b/core/conversion/evaluators/evaluators.h index 021bc84e62..b2b89c28a0 100644 --- a/core/conversion/evaluators/evaluators.h +++ b/core/conversion/evaluators/evaluators.h @@ -3,7 +3,7 @@ #include #include -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace trtorch { namespace core { @@ -19,7 +19,7 @@ typedef std::map kwargs; // when writing evaluators typedef std::function(const torch::jit::Node*, const kwargs&)> NodeEvaluator; -struct EvalRegistration { +struct EvalRegistration { torch::jit::NodeKind kind; NodeEvaluator evaluator; }; diff --git a/core/conversion/evaluators/prim.cpp b/core/conversion/evaluators/prim.cpp index ee3c76c4cf..64662005b3 100644 --- a/core/conversion/evaluators/prim.cpp +++ b/core/conversion/evaluators/prim.cpp @@ -1,5 +1,5 @@ -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/constants.h" +#include "torch/csrc/jit/ir/ir.h" +#include "torch/csrc/jit/ir/constants.h" #include "ATen/core/functional.h" #include "ATen/core/ivalue.h" #include "ATen/core/List.h" diff --git a/core/execution/BUILD b/core/execution/BUILD index 452bef5332..ed53cdefcf 100644 --- a/core/execution/BUILD +++ b/core/execution/BUILD @@ -14,7 +14,8 @@ cc_library( "@tensorrt//:nvinfer", "@libtorch//:libtorch", "//core/util:prelude" - ] + ], + alwayslink = True, ) load("@rules_pkg//:pkg.bzl", "pkg_tar") diff --git a/core/execution/TRTEngine.cpp b/core/execution/TRTEngine.cpp index 40fbf75151..0364dcf1c8 100644 --- a/core/execution/TRTEngine.cpp +++ b/core/execution/TRTEngine.cpp @@ -1,7 +1,7 @@ #include #include "NvInfer.h" -#include "torch/csrc/jit/script/function_schema_parser.h" +#include "torch/csrc/jit/frontend/function_schema_parser.h" #include "core/util/prelude.h" #include "core/execution/execution.h" @@ -10,51 +10,9 @@ namespace trtorch { namespace core { namespace execution { -std::string slugify(std::string s) { - std::replace(s.begin(), s.end(), '.', '_'); - return s; -} 
- -c10::FunctionSchema GenerateEngineFunctionSchema(EngineID id, nvinfer1::ICudaEngine* engine, uint64_t num_inputs, uint64_t num_outputs) { - std::stringstream ss; - ss << "trt::execute_engine_" << std::hex << id << "("; - - std::stringstream in_ss; - std::stringstream out_ss; - - uint64_t inputs_parsed = 0; - uint64_t outputs_parsed = 0; - for (int i = 0; i < engine->getNbBindings(); i++) { - if (engine->bindingIsInput(i)) { - in_ss << "Tensor in_"; - in_ss << slugify(engine->getBindingName(i)); - if (inputs_parsed + 1 < num_inputs) { - in_ss << ", "; - inputs_parsed++; - } - } else { - out_ss << "Tensor"; - if (outputs_parsed + 1 < num_outputs) { - out_ss << ", "; - outputs_parsed++; - } - } - } - - ss << in_ss.str(); - ss << ") -> ("; - ss << out_ss.str(); - ss << ')'; - return torch::jit::parseSchema(ss.str()); -} - -TRTEngine::TRTEngine() - : schema(torch::jit::parseSchema("trt::noop() -> ()")) { -} - -TRTEngine::TRTEngine(nvinfer1::ILogger& logger, std::string& serialized_engine) - : schema(torch::jit::parseSchema("trt::noop() -> ()")) { // Need a better default +TRTEngine::TRTEngine() {} +TRTEngine::TRTEngine(nvinfer1::ILogger& logger, std::string& serialized_engine) { rt = nvinfer1::createInferRuntime(logger); cuda_engine = rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size()); @@ -73,7 +31,6 @@ TRTEngine::TRTEngine(nvinfer1::ILogger& logger, std::string& serialized_engine) } } num_io = std::make_pair(inputs, outputs); - schema = GenerateEngineFunctionSchema(id, cuda_engine, inputs, outputs); } TRTEngine& TRTEngine::operator=(const TRTEngine& other) { @@ -82,7 +39,6 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) { cuda_engine = other.cuda_engine; exec_ctx = other.exec_ctx; num_io = other.num_io; - schema = other.schema; return (*this); } diff --git a/core/execution/TRTEngineManager.cpp b/core/execution/TRTEngineManager.cpp index 1705582ec8..27a6aeff28 100644 --- a/core/execution/TRTEngineManager.cpp +++ b/core/execution/TRTEngineManager.cpp @@ -16,33 +16,29 @@ class TRTEngineManager { TRTEngine* get_engine(EngineID uid) { auto iter = engine_registry_.find(uid); - TRTORCH_ASSERT(iter != engine_registry_.end(), "Unabled to find requested engine (ID: " << uid << ") in TensorRT Execution Manager"); + TRTORCH_ASSERT(iter != engine_registry_.end(), "Unabled to find requested engine (ID: " << uid << ") in TensorRT Execution Manager"); return &(iter->second); } - + // TODO: Should we have standing engines ready to run or should we be creating execution contexts JIT? 
EngineID register_engine(std::string& serialized_engine) { auto engine = TRTEngine(logger_, serialized_engine); EngineID uid = engine.id; engine_registry_[uid] = std::move(engine); LOG_DEBUG(logger_, "Registering new engine (ID: " << std::hex << uid << ") in TensorRT Execution Manager"); - RegisterEngineOp(engine); return uid; } void deregister_engine(EngineID uid) { auto iter = engine_registry_.find(uid); - if (iter == engine_registry_.end()) { - //TODO: Exception - LOG_ERROR(logger_, "Unabled to find requested engine (ID: " << uid << ") in TensorRT Execution Manager"); - } else { - auto engine = iter->second; - // Doing this here since for some reason the destructor causes segfaults - engine.exec_ctx->destroy(); - engine.cuda_engine->destroy(); - engine_registry_.erase(uid); - } + TRTORCH_ASSERT(iter != engine_registry_.end(), "Unabled to find requested engine (ID: " << uid << ") in TensorRT Execution Manager"); + + auto engine = iter->second; + // Doing this here since for some reason the destructor causes segfaults + engine.exec_ctx->destroy(); + engine.cuda_engine->destroy(); + engine_registry_.erase(uid); } private: @@ -61,25 +57,20 @@ uint64_t RegisterEngineFromSerializedEngine(std::string& serialized_engine) { } nvinfer1::ICudaEngine* GetCudaEngine(EngineID id) { - // Assuming exception will be thrown inside the manager if there is no corresponding engine + // Assuming exception will be thrown inside the manager if there is no corresponding engine return get_engine_manager().get_engine(id)->cuda_engine; } nvinfer1::IExecutionContext* GetExecCtx(EngineID id) { - // Assuming exception will be thrown inside the manager if there is no corresponding engine + // Assuming exception will be thrown inside the manager if there is no corresponding engine return get_engine_manager().get_engine(id)->exec_ctx; } std::pair GetEngineIO(EngineID id) { - // Assuming exception will be thrown inside the manager if there is no corresponding engine + // Assuming exception will be thrown inside the manager if there is no corresponding engine return get_engine_manager().get_engine(id)->num_io; } -c10::FunctionSchema GetEngineFunctionSchema(EngineID id) { - // Assuming exception will be thrown inside the manager if there is no corresponding engine - return get_engine_manager().get_engine(id)->schema; -} - void DeregisterEngine(EngineID id) { get_engine_manager().deregister_engine(id); } diff --git a/core/execution/execution.h b/core/execution/execution.h index 7f484071eb..2574717d8c 100644 --- a/core/execution/execution.h +++ b/core/execution/execution.h @@ -15,7 +15,6 @@ struct TRTEngine { nvinfer1::ICudaEngine* cuda_engine; nvinfer1::IExecutionContext* exec_ctx; std::pair num_io; - c10::FunctionSchema schema; EngineID id; TRTEngine(); @@ -28,7 +27,6 @@ uint64_t RegisterEngineFromSerializedEngine(std::string& serialized_engine); nvinfer1::ICudaEngine* GetCudaEngine(EngineID id); nvinfer1::IExecutionContext* GetExecCtx(EngineID id); std::pair GetEngineIO(EngineID id); -c10::FunctionSchema GetEngineFunctionSchema(EngineID id); void DeregisterEngine(EngineID id); } // namespace execution diff --git a/core/execution/register_trt_op.cpp b/core/execution/register_trt_op.cpp index 495c01c0e6..6085a59974 100644 --- a/core/execution/register_trt_op.cpp +++ b/core/execution/register_trt_op.cpp @@ -1,7 +1,7 @@ #include "c10/cuda/CUDAStream.h" #include "torch/torch.h" -#include "torch/csrc/jit/custom_operator.h" +#include "torch/csrc/jit/runtime/custom_operator.h" #include "core/util/prelude.h" #include 
"core/execution/execution.h" @@ -9,7 +9,7 @@ namespace trtorch { namespace core { namespace execution { - +namespace { std::vector RunCudaEngine(nvinfer1::IExecutionContext* ctx, std::pair io, std::vector& inputs) { std::vector gpu_handles; @@ -47,45 +47,40 @@ std::vector RunCudaEngine(nvinfer1::IExecutionContext* ctx, std::pai return outputs; } -c10::OperatorOptions aliasAnalysisFromSchema() { - c10::OperatorOptions result; - result.setAliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA); - return result; +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; } +// Switched to a global operator because op implementations need to be non-capturing lambdas in PYT 1.5.0+ +torch::jit::RegisterOperators jit_registry({ + torch::jit::Operator( + "trt::execute_engine(int id, ...) -> ...", + [](torch::jit::Stack& stack) -> int { + size_t num_inputs = torch::jit::pop(stack).toInt(); + // Verify calling convention (right to left or left to right) + std::vector inputs; + for (uint64_t i = 0; i < num_inputs - 1; i++) { + at::Tensor in; + torch::jit::pop(stack, in); + inputs.insert(inputs.begin(), std::move(in)); + } -// The other way to do this is to register a generic op something liek -// trt::execute_engine(int id, Tensor input, ...) -> (Tensor...) but not sure -// how well that would work -void RegisterEngineOp(TRTEngine& engine) { - EngineID id = engine.id; - torch::jit::RegisterOperators jit_registry({ - torch::jit::Operator( - engine.schema, - [id](torch::jit::Stack& stack) { - LOG_DEBUG("Attempting to run engine (ID: " << std::hex << id << ")"); - auto io = GetEngineIO(id); - auto num_in = io.first; - auto num_out = io.second; - // Verify calling convention (right to left or left to right) - std::vector inputs; - for (uint64_t i = 0; i < num_in; i++) { - at::Tensor in; - torch::jit::pop(stack, in); - inputs.insert(inputs.begin(), std::move(in)); - } + int64_t id = torch::jit::pop(stack).toInt(); + LOG_DEBUG("Attempting to run engine (ID: " << std::hex << id << ")"); + auto io = GetEngineIO(id); + auto num_out = io.second; - auto ctx = GetExecCtx(id); - auto outputs = RunCudaEngine(ctx, io, inputs); - for (uint64_t o = 0; o < num_out; o++) { - torch::jit::push(stack, std::move(outputs[o])); - } - return 0; - }, - aliasAnalysisFromSchema()) - }); -} + auto ctx = GetExecCtx(id); + auto outputs = RunCudaEngine(ctx, io, inputs); + for (uint64_t o = 0; o < num_out; o++) { + torch::jit::push(stack, std::move(outputs[o])); + } + return 0; + }, + aliasAnalysisFromSchema()) + }); +} // namespace } // namespace execution } // namespace core } // namespace trtorch diff --git a/core/lowering/drop_unused_nodes.cpp b/core/lowering/drop_unused_nodes.cpp index baa631cfa7..a99a6d210b 100644 --- a/core/lowering/drop_unused_nodes.cpp +++ b/core/lowering/drop_unused_nodes.cpp @@ -1,4 +1,4 @@ -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace torch { namespace jit { @@ -14,7 +14,7 @@ namespace trtorch { namespace core { namespace lowering { -// From torch/csrc/jit/interpreter.cpp +// From torch/csrc/jit/interpreter.cpp void DropUnusedNodes(torch::jit::Block* b) { auto create_drop_if_unused = [&](at::ArrayRef values) -> torch::jit::Node* { std::vector to_drop; @@ -26,7 +26,7 @@ void DropUnusedNodes(torch::jit::Block* b) { return nullptr; return b->owningGraph()->create(torch::jit::prim::Drop, to_drop, 0); }; - + if (auto d = create_drop_if_unused(b->inputs())) { b->prependNode(d); } diff --git a/core/lowering/irfusers/irfusers.h 
b/core/lowering/irfusers/irfusers.h index d793e3176a..c48f5d112d 100644 --- a/core/lowering/irfusers/irfusers.h +++ b/core/lowering/irfusers/irfusers.h @@ -1,6 +1,6 @@ #pragma once -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace trtorch { namespace core { @@ -11,7 +11,7 @@ void FuseFlattenLinear(std::shared_ptr& graph); void ExpandLogSoftmax(std::shared_ptr& graph); void RemoveDropout(std::shared_ptr& graph); void UnpackBatchNorm(std::shared_ptr& graph); - + } // namespace irfusers } // namespace lowering } // namespace core diff --git a/core/lowering/irfusers/unpack_batch_norm.cpp b/core/lowering/irfusers/unpack_batch_norm.cpp index 76a8ff0739..8a10d747f8 100644 --- a/core/lowering/irfusers/unpack_batch_norm.cpp +++ b/core/lowering/irfusers/unpack_batch_norm.cpp @@ -1,14 +1,12 @@ -#include "torch/csrc/jit/custom_operator.h" +#include "torch/csrc/jit/runtime/custom_operator.h" #include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/subgraph_rewrite.h" namespace torch { namespace jit { -c10::OperatorOptions aliasAnalysisFromSchema() { - c10::OperatorOptions result; - result.setAliasAnalysis(c10::AliasAnalysisKind::FROM_SCHEMA); - return result; +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; } RegisterOperators trt_const_op_reg({ @@ -30,18 +28,18 @@ namespace irfusers { // // May be abusing aten::_tensor_to_list(Tensor self) -> int[] // // Treating it as an emit_constant by the converters // // We could register a custom op (trt::emit_constant) which we can use to convert -// // constant tensors to TRT ITensors +// // constant tensors to TRT ITensors void UnpackBatchNorm(std::shared_ptr& graph) { // Convert BatchNorm into individual operators // batch_norm = gamma * (in - mu) / sqrt(var + epsilon) + beta std::string batch_norm_pattern = R"IR( - graph(%input, %gamma, %beta, %mean, + graph(%input, %gamma, %beta, %mean, %var, %training, %momentum, %eps, %cudnn): %1 = aten::batch_norm(%input, %gamma, %beta, %mean, %var, %training, %momentum, %eps, %cudnn) return (%1))IR"; std::string expanded_batch_norm_pattern = R"IR( - graph(%input, %gamma, %beta, %mean, + graph(%input, %gamma, %beta, %mean, %var, %training, %momentum, %eps, %cudnn): %gamma_trt = trt::const(%gamma) %beta_trt = trt::const(%beta) @@ -57,7 +55,7 @@ void UnpackBatchNorm(std::shared_ptr& graph) { %7: Scalar = prim::Constant[value=1]() %8 = aten::add(%6, %beta_trt, %7) return(%8))IR"; - + torch::jit::SubgraphRewriter unpack_batch_norm; unpack_batch_norm.RegisterRewritePattern(batch_norm_pattern, expanded_batch_norm_pattern); unpack_batch_norm.runOnGraph(graph); diff --git a/core/lowering/lowering.h b/core/lowering/lowering.h index ed34ee30a0..74efc84036 100644 --- a/core/lowering/lowering.h +++ b/core/lowering/lowering.h @@ -1,11 +1,11 @@ #pragma once #include -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace trtorch { namespace core { namespace lowering { - + void LowerBlock(torch::jit::Block* b); void LowerGraph(std::shared_ptr& g); diff --git a/core/util/jit_util.h b/core/util/jit_util.h index d77d3229a6..538be211a6 100644 --- a/core/util/jit_util.h +++ b/core/util/jit_util.h @@ -3,7 +3,7 @@ #include #include -#include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/ir/ir.h" namespace trtorch { namespace core { diff --git a/cpp/api/include/trtorch/trtorch.h b/cpp/api/include/trtorch/trtorch.h index b2c99a136e..865fe021c5 100644 --- a/cpp/api/include/trtorch/trtorch.h +++ 
b/cpp/api/include/trtorch/trtorch.h @@ -16,9 +16,7 @@ namespace torch { namespace jit { struct Graph; -namespace script { struct Module; -} // namespace script } // namespace jit } // namespace torch @@ -240,7 +238,7 @@ TRTORCH_API void dump_build_info(); * * Will print out a list of unsupported operators if the graph is unsupported */ -TRTORCH_API bool CheckMethodOperatorSupport(const torch::jit::script::Module& module, std::string method_name); +TRTORCH_API bool CheckMethodOperatorSupport(const torch::jit::Module& module, std::string method_name); /** * @brief Compile a TorchScript module for NVIDIA GPUs using TensorRT @@ -253,7 +251,7 @@ TRTORCH_API bool CheckMethodOperatorSupport(const torch::jit::script::Module& mo * * Converts specifically the forward method of a TorchScript Module */ -TRTORCH_API torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, ExtraInfo info); +TRTORCH_API torch::jit::script::Module CompileGraph(const torch::jit::Module& module, ExtraInfo info); /** * @brief Compile a TorchScript method for NVIDIA GPUs using TensorRT @@ -266,7 +264,7 @@ TRTORCH_API torch::jit::script::Module CompileGraph(const torch::jit::script::Mo * and will convert selected method to a serialized TensorRT engine which can be run with * TensorRT */ -TRTORCH_API std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& module, std::string method_name, ExtraInfo info); +TRTORCH_API std::string ConvertGraphToTRTEngine(const torch::jit::Module& module, std::string method_name, ExtraInfo info); namespace ptq { /** diff --git a/cpp/api/src/extra_info.cpp b/cpp/api/src/extra_info.cpp index f7fc5709e9..5bc12fa204 100644 --- a/cpp/api/src/extra_info.cpp +++ b/cpp/api/src/extra_info.cpp @@ -1,4 +1,4 @@ -#include "torch/csrc/jit/script/module.h" +#include "torch/csrc/jit/api/module.h" #include "core/util/prelude.h" #include "core/compiler.h" diff --git a/cpp/api/src/trtorch.cpp b/cpp/api/src/trtorch.cpp index bb8e5a7845..e6a1940db1 100644 --- a/cpp/api/src/trtorch.cpp +++ b/cpp/api/src/trtorch.cpp @@ -1,4 +1,4 @@ -#include "torch/csrc/jit/script/module.h" +#include "torch/csrc/jit/api/module.h" #include "core/util/prelude.h" #include "core/compiler.h" diff --git a/tests/core/converters/test_activation.cpp b/tests/core/converters/test_activation.cpp index bae82a4fe3..299420878c 100644 --- a/tests/core/converters/test_activation.cpp +++ b/tests/core/converters/test_activation.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -11,7 +11,7 @@ TEST(Converters, ATenReLUConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(-5, 5, {5}, {at::kCUDA}); auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); @@ -31,7 +31,7 @@ TEST(Converters, ATenSigmoidConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(-5, 5, {5}, {at::kCUDA}); auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); @@ -51,7 +51,7 @@ TEST(Converters, ATenTanhConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(-5, 5, {5}, {at::kCUDA}); auto params = 
trtorch::core::conversion::get_named_params(g->inputs(), {}); @@ -96,7 +96,7 @@ TEST(Converters, ATenHardTanhCustomRangeConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(-5, 5, {5}, {at::kCUDA}); auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); diff --git a/tests/core/converters/test_conv.cpp b/tests/core/converters/test_conv.cpp index 69f040bb43..154a5b06ca 100644 --- a/tests/core/converters/test_conv.cpp +++ b/tests/core/converters/test_conv.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -12,7 +12,7 @@ void conv_test_helper(std::string graph_ir) { auto g = std::make_shared(); - torch::jit::script::parseIR(graph_ir, &*g); + torch::jit::parseIR(graph_ir, &*g); auto in = at::randint(1, 10, {1, 3, 10, 10}, {at::kCUDA}); auto w = at::randint(1, 10, {8, 3, 5, 5}, {at::kCUDA}); @@ -54,7 +54,7 @@ TEST(Converters, ATenConvolutionConvertsCorrectly) { return (%12))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(1, 10, {1, 3, 10, 10}, {at::kCUDA}); auto w = at::randint(1, 10, {8, 3, 5, 5}, {at::kCUDA}); @@ -96,7 +96,7 @@ TEST(Converters, ATenConvolutionNoBiasConvertsCorrectly) { return (%12))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(1, 2, {1, 1, 3, 3}, {at::kCUDA}); auto w = at::randint(1, 2, {4, 1, 2, 2}, {at::kCUDA}); @@ -136,7 +136,7 @@ TEST(Converters, ATenConvolutionWithStrideConvertsCorrectly) { return (%13))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(1, 10, {1, 3, 9, 9}, {at::kCUDA}); auto w = at::randint(1, 10, {4, 3, 3, 3}, {at::kCUDA}); @@ -179,7 +179,7 @@ TEST(Converters, ATenConvolutionWithPaddingConvertsCorrectly) { return (%13))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(1, 10, {1, 3, 4, 4}, {at::kCUDA}); auto w = at::randint(1, 10, {4, 3, 2, 2}, {at::kCUDA}); diff --git a/tests/core/converters/test_element_wise.cpp b/tests/core/converters/test_element_wise.cpp index db8ea15870..85cb90f6b7 100644 --- a/tests/core/converters/test_element_wise.cpp +++ b/tests/core/converters/test_element_wise.cpp @@ -1,12 +1,12 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" void pointwise_test_helper(std::string graph_ir) { auto g = std::make_shared(); - torch::jit::script::parseIR(graph_ir, &*g); + torch::jit::parseIR(graph_ir, &*g); auto in0 = at::randint(1, 5, {5}, {at::kCUDA}); auto in1 = at::randint(1, 5, {5}, {at::kCUDA}); diff --git a/tests/core/converters/test_linear.cpp b/tests/core/converters/test_linear.cpp index f6d543ab04..c668c03e63 100644 --- a/tests/core/converters/test_linear.cpp +++ b/tests/core/converters/test_linear.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -13,7 +13,7 @@ TEST(Converters, ATenLinearNoBiasConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - 
torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); //Input Tensor needs to be 4D for TensorRT linear auto in = at::randint(1, 10, {1, 2}, {at::kCUDA}); @@ -40,7 +40,7 @@ TEST(Converters, ATenLinearBiasConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); // WARN: TRT expects a 4D input eventually, but pytorch does not require a channel dim auto in = at::randint(1, 5, {1, 3}, {at::kCUDA}); diff --git a/tests/core/converters/test_pooling.cpp b/tests/core/converters/test_pooling.cpp index f278d8b65f..6e7b612612 100644 --- a/tests/core/converters/test_pooling.cpp +++ b/tests/core/converters/test_pooling.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -18,7 +18,7 @@ TEST(Converters, ATenMaxPool2DConvertsCorrectly) { return (%10))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); //PyTorch MaxPool needs a 3D input auto in = at::randint(-5, 5, {1, 4, 4}, at::kCUDA); @@ -42,7 +42,7 @@ TEST(Converters, ATenAdaptiveAvgPool2DConvertsCorrectly) { return (%10))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); //PyTorch MaxPool needs a 3D input auto in = at::randint(-5, 5, {1, 12, 16}, at::kCUDA); diff --git a/tests/core/converters/test_reduce.cpp b/tests/core/converters/test_reduce.cpp index 81d6e6c606..64834890d4 100644 --- a/tests/core/converters/test_reduce.cpp +++ b/tests/core/converters/test_reduce.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -56,7 +56,7 @@ std::string gen_keepdim_graph(const std::string& op) { void test_body(const std::string& graph, at::Tensor& in) { auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); auto jit_results = trtorch::tests::util::RunGraph(g, params, {in}); diff --git a/tests/core/converters/test_softmax.cpp b/tests/core/converters/test_softmax.cpp index 5f8186021c..87d5030cbf 100644 --- a/tests/core/converters/test_softmax.cpp +++ b/tests/core/converters/test_softmax.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -13,7 +13,7 @@ TEST(Converters, ATenSoftmax1DConvertsCorrectly) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(0, 5, {5}, {at::kCUDA}); auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); @@ -36,7 +36,7 @@ TEST(Converters, ATenSoftmaxNDConvertsCorrectlySub3DIndex) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(0, 5, {1,2,2,2,2}, {at::kCUDA}); @@ -60,7 +60,7 @@ TEST(Converters, ATenSoftmaxNDConvertsCorrectlyAbove3DIndex) { return (%3))IR"; auto g = std::make_shared(); - torch::jit::script::parseIR(graph, &*g); + torch::jit::parseIR(graph, &*g); auto in = at::randint(0, 5, {1,2,2,2,2}, {at::kCUDA}); diff --git a/tests/core/converters/test_unary.cpp 
b/tests/core/converters/test_unary.cpp index a82a4ade9f..78c4a68da3 100644 --- a/tests/core/converters/test_unary.cpp +++ b/tests/core/converters/test_unary.cpp @@ -1,6 +1,6 @@ #include #include "gtest/gtest.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/irparser.h" #include "tests/util/util.h" #include "core/compiler.h" @@ -19,7 +19,7 @@ std::string gen_test_graph(const std::string &unary) { const auto graph = gen_test_graph(#unary); \ \ auto g = std::make_shared(); \ - torch::jit::script::parseIR(graph, &*g); \ + torch::jit::parseIR(graph, &*g); \ \ auto in = at::empty({10}, {at::kCUDA}).uniform_(0, 0.5); \ auto params = \ diff --git a/tests/modules/hub.py b/tests/modules/hub.py index 8be1047837..b1b3a1a102 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -2,13 +2,12 @@ import torchvision.models as models models = { - #"resnet18": models.resnet18(pretrained=True), "alexnet": models.alexnet(pretrained=True), "vgg16": models.vgg16(pretrained=True), "squeezenet": models.squeezenet1_0(pretrained=True), "densenet": models.densenet161(pretrained=True), - "inception": models.inception_v3(pretrained=True), - "googlenet": models.googlenet(pretrained=True), + "inception_v3": models.inception_v3(pretrained=True), + #"googlenet": models.googlenet(pretrained=True), "shufflenet": models.shufflenet_v2_x1_0(pretrained=True), "mobilenet_v2": models.mobilenet_v2(pretrained=True), "resnext50_32x4d": models.resnext50_32x4d(pretrained=True), @@ -18,6 +17,7 @@ "resnet50": torch.hub.load('pytorch/vision:v0.5.0', 'resnet50', pretrained=True)} for n, m in models.items(): + print("Downloading {}".format(n)) m = m.eval().cuda() x = torch.ones((1, 3, 224, 224)).cuda() jit_model = torch.jit.trace(m, x) diff --git a/tests/modules/module_test.h b/tests/modules/module_test.h index d7cd7e080b..61f8d0612a 100644 --- a/tests/modules/module_test.h +++ b/tests/modules/module_test.h @@ -4,6 +4,8 @@ #include "tests/util/util.h" #include "trtorch/trtorch.h" #include "c10/cuda/CUDACachingAllocator.h" +#include "cuda_runtime_api.h" + using PathAndInSize = std::pair>>; @@ -25,6 +27,7 @@ class ModuleTests } void TearDown() { + cudaDeviceSynchronize(); c10::cuda::CUDACachingAllocator::emptyCache(); } protected: diff --git a/tests/util/run_forward.cpp b/tests/util/run_forward.cpp index f8f22b900d..4806678a1f 100644 --- a/tests/util/run_forward.cpp +++ b/tests/util/run_forward.cpp @@ -6,13 +6,13 @@ namespace trtorch { namespace tests { namespace util { -torch::jit::IValue RunModuleForward(torch::jit::script::Module& mod, std::vector inputs) { +torch::jit::IValue RunModuleForward(torch::jit::Module& mod, std::vector inputs) { mod.to(at::kCUDA); return mod.forward(inputs); } -std::vector RunModuleForwardAsEngine(torch::jit::script::Module& mod, std::vector inputs) { +std::vector RunModuleForwardAsEngine(torch::jit::Module& mod, std::vector inputs) { auto forward_graph = mod.get_method("forward"); std::vector> input_ranges; for (auto in : inputs) { diff --git a/tests/util/run_graph.cpp b/tests/util/run_graph.cpp index 70f839c936..e749eb1e87 100644 --- a/tests/util/run_graph.cpp +++ b/tests/util/run_graph.cpp @@ -1,4 +1,4 @@ -#include "torch/csrc/jit/graph_executor.h" +#include "torch/csrc/jit/runtime/graph_executor.h" #include "tests/util/util.h" namespace trtorch { @@ -28,15 +28,15 @@ std::vector RunGraph(std::shared_ptr& g, } } - torch::jit::GraphExecutor executor(g); + torch::jit::GraphExecutor executor(g, ""); auto stack = CreateStack(std::move(inputs_)); - + executor.run(stack); std::vector 
outputs; for (size_t i = 0; i < g->outputs().size(); i++) { outputs.push_back(stack[i].toTensor()); } - + return outputs; } diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index a6a6070596..07e0a9f3ce 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -1,8 +1,8 @@ #include "core/util/prelude.h" #include "NvInfer.h" #include "c10/cuda/CUDAStream.h" -#include "torch/csrc/jit/ir.h" -#include "torch/csrc/jit/irparser.h" +#include "torch/csrc/jit/ir/ir.h" +#include "torch/csrc/jit/ir/irparser.h" #include "core/conversion/conversion.h" #include "cuda_runtime_api.h" diff --git a/tests/util/util.h b/tests/util/util.h index fa9008bd39..6ae72b3370 100644 --- a/tests/util/util.h +++ b/tests/util/util.h @@ -29,11 +29,11 @@ std::vector RunGraphEngine(std::shared_ptr& g, std::vector inputs); // Run the forward method of a module and return results -torch::jit::IValue RunModuleForward(torch::jit::script::Module& mod, +torch::jit::IValue RunModuleForward(torch::jit::Module& mod, std::vector inputs); // Convert the forward module to a TRT engine and return results -std::vector RunModuleForwardAsEngine(torch::jit::script::Module& mod, +std::vector RunModuleForwardAsEngine(torch::jit::Module& mod, std::vector inputs); diff --git a/third_party/BUILD b/third_party/BUILD new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/cuda/BUILD b/third_party/cuda/BUILD index 651964393c..758a81d0db 100644 --- a/third_party/cuda/BUILD +++ b/third_party/cuda/BUILD @@ -3,7 +3,7 @@ package(default_visibility = ["//visibility:public"]) cc_library( name = "cudart", srcs = glob([ - "lib/**/libcudart.so.10.1", + "lib/**/libcudart.so.10.2", ]), hdrs = glob([ "include/**/*.h", diff --git a/third_party/cudnn/BUILD b/third_party/cudnn/archive/BUILD similarity index 82% rename from third_party/cudnn/BUILD rename to third_party/cudnn/archive/BUILD index e6423b6f20..027a1a6bcc 100644 --- a/third_party/cudnn/BUILD +++ b/third_party/cudnn/archive/BUILD @@ -6,17 +6,17 @@ cc_library( includes = ["include/"], visibility = ["//visibility:private"], ) - + cc_import( name = "cudnn_lib", - shared_library = "lib/x86_64-linux-gnu/libcudnn.so", + shared_library = "lib64/libcudnn.so.7.6.5", visibility = ["//visibility:private"], ) cc_library( name = "cudnn", deps = [ - "cudnn_headers", + "cudnn_headers", "cudnn_lib" ], visibility = ["//visibility:public"], diff --git a/third_party/cudnn/local/BUILD b/third_party/cudnn/local/BUILD new file mode 100644 index 0000000000..3af56fa61f --- /dev/null +++ b/third_party/cudnn/local/BUILD @@ -0,0 +1,25 @@ +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "cudnn_headers", + hdrs = ["include/cudnn.h"] + glob(["include/cudnn+.h"]), + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_import( + name = "cudnn_lib", + shared_library = "lib/x86_64-linux-gnu/libcudnn.so.7.6.5, + visibility = ["//visibility:private"], +) + +cc_library( + name = "cudnn", + deps = [ + "cudnn_headers", + "cudnn_lib" + ], + visibility = ["//visibility:public"], +) + + diff --git a/third_party/libtorch/BUILD b/third_party/libtorch/BUILD index 0c5bff0d2f..9da5da47c0 100644 --- a/third_party/libtorch/BUILD +++ b/third_party/libtorch/BUILD @@ -18,11 +18,17 @@ cc_library( ) + glob([ 'include/torch/csrc/api/include/**/*.h' ]), - srcs = ['lib/libtorch.so'], + srcs = [ + 'lib/libtorch.so', + 'lib/libtorch_cuda.so', + 'lib/libtorch_cpu.so', + 'lib/libtorch_global_deps.so', + ], deps = [ ":ATen", ":torch_deps", 
":c10_cuda", + ":caffe2" ], includes = [ "include", @@ -34,7 +40,7 @@ cc_library( name = 'torch_deps', srcs = [ 'lib/libnvToolsExt-3965bdd0.so.1', - "lib/libcudart-1b201d85.so.10.1", + "lib/libcudart-80664282.so.10.2", ] ) @@ -66,8 +72,7 @@ cc_library( cc_library( name = "c10_deps", srcs = [ - "lib/libgomp-753e6e92.so.1", - "lib/libcudart-1b201d85.so.10.1", + "lib/libcudart-80664282.so.10.2", ] ) @@ -84,7 +89,12 @@ cc_library( hdrs = glob([ 'include/caffe2/**/*.h' ]), - srcs = ['lib/libcaffe2_nvrtc.so'], + srcs = [ + 'lib/libcaffe2_nvrtc.so', + 'lib/libcaffe2_detectron_ops_gpu.so', + 'lib/libcaffe2_observers.so', + 'lib/libcaffe2_module_test_dynamic.so' + ], strip_include_prefix = "include", deps = [ ":caffe2_deps" @@ -95,6 +105,5 @@ cc_library( name = 'caffe2_deps', srcs = [ 'lib/libnvToolsExt-3965bdd0.so.1', - 'lib/libgomp-753e6e92.so.1' ] ) diff --git a/third_party/tensorrt/archive/BUILD b/third_party/tensorrt/archive/BUILD new file mode 100644 index 0000000000..2dc2959f79 --- /dev/null +++ b/third_party/tensorrt/archive/BUILD @@ -0,0 +1,167 @@ +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "nvinfer_headers", + hdrs = glob([ + "include/NvInfer*.h", + ], exclude=[ + "include/NvInferPlugin.h", + "include/NvInferPluginUtils.h" + ]), + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_import( + name = "nvinfer_lib", + shared_library = "lib/libnvinfer.so.7.0.0", + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvinfer", + deps = [ + "nvinfer_headers", + "nvinfer_lib", + "@cuda//:cudart", + "@cuda//:cublas", + "@cudnn//:cudnn" + ], + visibility = ["//visibility:public"], +) + +#################################################################################### + +cc_import( + name = "nvparsers_lib", + shared_library = "lib/libnvparsers.so.7.0.0", + visibility = ["//visibility:private"], +) + + +cc_library( + name = "nvparsers_headers", + hdrs = [ + "include/NvCaffeParser.h", + "include/NvOnnxParser.h", + "include/NvOnnxParserRuntime.h", + "include/NvOnnxConfig.h", + "include/NvUffParser.h" + ], + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvparsers", + deps = [ + "nvparsers_headers", + "nvparsers_lib", + "nvinfer" + ], + visibility = ["//visibility:public"], +) + +#################################################################################### + +cc_import( + name = "nvonnxparser_lib", + shared_library = "lib/libnvonnxparser.so.7.0.0", + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvonnxparser_headers", + hdrs = [ + "include/NvOnnxParser.h", + "include/NvOnnxParserRuntime.h", + "include/NvOnnxConfig.h" + ], + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvonnxparser", + deps = [ + "nvonnxparser_headers", + "nvonnxparser_lib", + "nvinfer" + ], + visibility = ["//visibility:public"], +) + +#################################################################################### + +cc_import( + name = "nvonnxparser_runtime_lib", + shared_library = "lib/libnvonnxparser_runtime.so.7.0.0", + visibility = ["//visibility:public"], +) + +cc_library( + name = "nvonnxparser_runtime_header", + hdrs = ["include/NvOnnxParserRuntime.h"], + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvonnxparser_runtime", + deps = [ + "nvparsers_headers", + "nvparsers_lib", + "nvinfer" + ], + visibility = ["//visibility:public"], +) + 
+#################################################################################### + +cc_import( + name = "nvcaffeparser_lib", + shared_library = "lib/libnvcaffe_parsers.so.7.0.0", + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvcaffeparser_headers", + hdrs = ["include/NvCaffeParser.h"], + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvcaffeparser", + deps = [ + "nvcaffeparser_headers", + "nvcaffeparser_lib", + "nvinfer" + ], + visibility = ["//visibility:public"], +) + +#################################################################################### + +cc_import( + name = "nvinferplugin_lib", + shared_library = "lib/libnvinfer_plugin.so.7.0.0", + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvinferplugin_headers", + hdrs = glob(["include/NvInferPlugin*.h"]), + includes = ["include/"], + visibility = ["//visibility:private"], +) + +cc_library( + name = "nvinferplugin", + deps = [ + "nvinferplugin_headers", + "nvinferplugin_lib", + "nvinfer" + ], + visibility = ["//visibility:public"], +) diff --git a/third_party/tensorrt/BUILD b/third_party/tensorrt/local/BUILD similarity index 100% rename from third_party/tensorrt/BUILD rename to third_party/tensorrt/local/BUILD From 353f2d252cf795153241512ea497d9e61e6338b5 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Tue, 28 Apr 2020 19:01:37 -0700 Subject: [PATCH 03/13] feat(//core/conversion/converters/impl/shuffle): Implement aten::resize Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/BUILD | 1 + core/conversion/converters/converters.h | 1 + core/conversion/converters/impl/shuffle.cpp | 33 +++++++++++++++++++++ core/conversion/converters/impl/softmax.cpp | 10 ++----- core/util/trt_util.cpp | 23 ++++++++++++++ core/util/trt_util.h | 1 + tests/core/converters/BUILD | 5 ++++ tests/core/converters/test_shuffle.cpp | 29 ++++++++++++++++++ 8 files changed, 95 insertions(+), 8 deletions(-) create mode 100644 core/conversion/converters/impl/shuffle.cpp create mode 100644 tests/core/converters/test_shuffle.cpp diff --git a/core/conversion/converters/BUILD b/core/conversion/converters/BUILD index 43b30857ee..ccc72bb4cb 100644 --- a/core/conversion/converters/BUILD +++ b/core/conversion/converters/BUILD @@ -17,6 +17,7 @@ cc_library( "impl/linear.cpp", "impl/pooling.cpp", "impl/reduce.cpp", + "impl/shuffle.cpp", "impl/softmax.cpp", "impl/unary.cpp", ], diff --git a/core/conversion/converters/converters.h b/core/conversion/converters/converters.h index 465c952ceb..6c151b061a 100644 --- a/core/conversion/converters/converters.h +++ b/core/conversion/converters/converters.h @@ -6,6 +6,7 @@ #include "torch/csrc/jit/runtime/custom_operator.h" #include "ATen/core/function_schema.h" +#include "core/util/prelude.h" #include "core/conversion/conversionctx/ConversionCtx.h" namespace trtorch { diff --git a/core/conversion/converters/impl/shuffle.cpp b/core/conversion/converters/impl/shuffle.cpp new file mode 100644 index 0000000000..6c262b89a9 --- /dev/null +++ b/core/conversion/converters/impl/shuffle.cpp @@ -0,0 +1,33 @@ +#include "core/conversion/converters/converters.h" + +namespace trtorch { +namespace core { +namespace conversion { +namespace converters { +namespace impl { +namespace { + +static auto shuffle_registrations = RegisterNodeConversionPatterns() + .pattern({ + "aten::reshape(Tensor self, int[] shape) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); 
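+                        // The requested shape arrives as an IValue int list; toDimsPad (added in trt_util.cpp
+                        // later in this patch) converts it to nvinfer1::Dims, left-padding with 1s so the result
+                        // has at least rank 2, while longer shapes pass through unchanged.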
+ auto new_shape = util::toDimsPad(args[1].unwrapToIntList(), 2); + + auto shuffle = ctx->net->addShuffle(*in); + TRTORCH_CHECK(shuffle, "Unable to create shuffle layer from node: " << *n); + shuffle->setReshapeDimensions(new_shape); + shuffle->setName(util::node_info(n).c_str()); + + auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); + LOG_DEBUG("Output tensor shape: " << out_tensor->getDimensions()); + + return true; + } + }); +} // namespace +} // namespace impl +} // namespace converters +} // namespace conversion +} // namespace core +} // namespace trtorch diff --git a/core/conversion/converters/impl/softmax.cpp b/core/conversion/converters/impl/softmax.cpp index c138c759f0..35f6f04ef1 100644 --- a/core/conversion/converters/impl/softmax.cpp +++ b/core/conversion/converters/impl/softmax.cpp @@ -1,4 +1,3 @@ -#include "core/util/prelude.h" #include "core/conversion/converters/converters.h" namespace trtorch { @@ -29,12 +28,7 @@ static auto softmax_registrations = RegisterNodeConversionPatterns() auto softmax = ctx->net->addSoftMax(*in); TRTORCH_CHECK(softmax, "Unable to create softmax layer from node: " << *n); - - if (!softmax) { - LOG_ERROR("Unable to create softmax layer from node: " << *n); - return false; - } - LOG_WARNING("Disregarding dtype argument, please verify"); + LOG_DEBUG("Disregarding dtype argument"); if (shape.size() > 3) { softmax->setAxes(1 << (dim)); @@ -69,4 +63,4 @@ static auto softmax_registrations = RegisterNodeConversionPatterns() } // namespace converters } // namespace conversion } // namespace core -} // trtorch +} // namespace trtorch diff --git a/core/util/trt_util.cpp b/core/util/trt_util.cpp index 89214e5efd..2f6706c51a 100644 --- a/core/util/trt_util.cpp +++ b/core/util/trt_util.cpp @@ -59,6 +59,29 @@ nvinfer1::Dims toDims(c10::List l) { return dims; } +nvinfer1::Dims toDimsPad(c10::List l, uint64_t pad_to) { + if (l.size() > pad_to) { + LOG_DEBUG("Requested padding of dimensions to " << pad_to << " but found " << l.size() << " dimensions, not going to pad"); + return toDims(l); + } + + if (pad_to > nvinfer1::Dims::MAX_DIMS) { + //TODO: Handle this with exceptions or whatever + LOG_INTERNAL_ERROR("The list requested to be converted to nvinfer1::Dims exceeds the max number of dimensions for TensorRT"); + } + + nvinfer1::Dims dims; + dims.nbDims = pad_to; + for (size_t i = 0; i < pad_to - l.size(); i++) { + dims.d[i] = 1; + } + + for (size_t i = pad_to - l.size(); i < pad_to; i++) { + dims.d[i] = l[i - (pad_to - l.size())]; + } + return dims; +} + std::vector toVec(nvinfer1::Dims d) { std::vector dims; for (int i = 0; i < d.nbDims; i++) { diff --git a/core/util/trt_util.h b/core/util/trt_util.h index bf8ea5b224..09cf5ff418 100644 --- a/core/util/trt_util.h +++ b/core/util/trt_util.h @@ -78,6 +78,7 @@ namespace util { int64_t volume(const nvinfer1::Dims& d); nvinfer1::Dims toDimsPad(c10::IntArrayRef l, uint64_t pad_to); +nvinfer1::Dims toDimsPad(c10::List l, uint64_t pad_to); nvinfer1::Dims toDims(c10::IntArrayRef l); nvinfer1::Dims toDims(c10::List l); nvinfer1::DimsHW toDimsHW(c10::List l); diff --git a/tests/core/converters/BUILD b/tests/core/converters/BUILD index 8cbe9c4e68..39d978afa9 100644 --- a/tests/core/converters/BUILD +++ b/tests/core/converters/BUILD @@ -4,6 +4,10 @@ converter_test( name = "test_softmax" ) +converter_test( + name = "test_shuffle" +) + converter_test( name = "test_activation" ) @@ -36,6 +40,7 @@ test_suite( name = "test_converters", tests = [ ":test_softmax", + ":test_shuffle", 
":test_activation", ":test_pooling", ":test_unary", diff --git a/tests/core/converters/test_shuffle.cpp b/tests/core/converters/test_shuffle.cpp new file mode 100644 index 0000000000..459d271d05 --- /dev/null +++ b/tests/core/converters/test_shuffle.cpp @@ -0,0 +1,29 @@ +#include +#include "gtest/gtest.h" +#include "torch/csrc/jit/ir/irparser.h" +#include "tests/util/util.h" +#include "core/compiler.h" + +TEST(Converters, ATenReshapeConvertsCorrectly) { + const auto graph = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=3]() + %2 : int = prim::Constant[value=2]() + %3 : int[] = prim::ListConstruct(%1, %2) + %4 : Tensor = aten::reshape(%0, %3) + return (%4))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, &*g); + + auto in = at::randint(0, 5, {2, 3}, {at::kCUDA}); + auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto jit_results = trtorch::tests::util::RunGraph(g, params, {in}); + + in = at::clone(in); + params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto trt_results = trtorch::tests::util::RunGraphEngine(g, params, {in}); + auto trt = trt_results[0].reshape_as(jit_results[0]); + + ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[0], trt, 2e-6)); +} \ No newline at end of file From 90c44b9fc142ee6ba6062011e2f2b157df86d098 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Wed, 29 Apr 2020 14:54:49 -0700 Subject: [PATCH 04/13] feat(//core/util): New logging level for Graph Dumping Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/util/logging/TRTorchLogger.cpp | 51 +++++++++++++++-------------- core/util/logging/TRTorchLogger.h | 7 ++-- core/util/macros.h | 7 ++-- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/core/util/logging/TRTorchLogger.cpp b/core/util/logging/TRTorchLogger.cpp index d3968c9ee3..678506c09f 100644 --- a/core/util/logging/TRTorchLogger.cpp +++ b/core/util/logging/TRTorchLogger.cpp @@ -17,39 +17,36 @@ namespace util { namespace logging { TRTorchLogger::TRTorchLogger(std::string prefix, Severity severity, bool color) - : prefix_(prefix), reportable_severity_(severity), color_(color) {} + : prefix_(prefix), reportable_severity_((LogLevel) severity), color_(color) {} TRTorchLogger::TRTorchLogger(std::string prefix, LogLevel lvl, bool color) - : prefix_(prefix), reportable_severity_((Severity) lvl), color_(color) {} + : prefix_(prefix), reportable_severity_(lvl), color_(color) {} void TRTorchLogger::log(LogLevel lvl, std::string msg) { - Severity severity = (Severity) lvl; - log(severity, msg.c_str()); -} - -void TRTorchLogger::log(Severity severity, const char* msg) { // suppress messages with severity enum value greater than the reportable - if (severity > reportable_severity_) { + if (lvl > reportable_severity_) { return; } if (color_) { - switch (severity) { - case Severity::kINTERNAL_ERROR: std::cerr << TERM_RED; break; - case Severity::kERROR: std::cerr << TERM_RED; break; - case Severity::kWARNING: std::cerr << TERM_YELLOW; break; - case Severity::kINFO: std::cerr << TERM_GREEN; break; - case Severity::kVERBOSE: std::cerr << TERM_MAGENTA; break; + switch (lvl) { + case LogLevel::kINTERNAL_ERROR: std::cerr << TERM_RED; break; + case LogLevel::kERROR: std::cerr << TERM_RED; break; + case LogLevel::kWARNING: std::cerr << TERM_YELLOW; break; + case LogLevel::kINFO: std::cerr << TERM_GREEN; break; + case LogLevel::kDEBUG: std::cerr << TERM_MAGENTA; break; + case LogLevel::kGRAPH: std::cerr << TERM_NORMAL; break; default: break; } } - switch 
(severity) { - case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; - case Severity::kERROR: std::cerr << "ERROR: "; break; - case Severity::kWARNING: std::cerr << "WARNING: "; break; - case Severity::kINFO: std::cerr << "INFO: "; break; - case Severity::kVERBOSE: std::cerr << "DEBUG: "; break; + switch (lvl) { + case LogLevel::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; + case LogLevel::kERROR: std::cerr << "ERROR: "; break; + case LogLevel::kWARNING: std::cerr << "WARNING: "; break; + case LogLevel::kINFO: std::cerr << "INFO: "; break; + case LogLevel::kDEBUG: std::cerr << "DEBUG: "; break; + case LogLevel::kGRAPH: std::cerr << "GRAPH: "; break; default: std::cerr << "UNKNOWN: "; break; } @@ -60,16 +57,22 @@ void TRTorchLogger::log(Severity severity, const char* msg) { std::cerr << prefix_ << msg << std::endl; } + +void TRTorchLogger::log(Severity severity, const char* msg) { + LogLevel lvl = (LogLevel) severity; + log(lvl, std::string(msg)); +} + void TRTorchLogger::set_logging_prefix(std::string prefix) { prefix_ = prefix; } void TRTorchLogger::set_reportable_severity(Severity severity) { - reportable_severity_ = severity; + reportable_severity_ = (LogLevel) severity; } void TRTorchLogger::set_reportable_log_level(LogLevel lvl) { - reportable_severity_ = (Severity) lvl; + reportable_severity_ = lvl; } void TRTorchLogger::set_is_colored_output_on(bool colored_output_on) { @@ -81,11 +84,11 @@ std::string TRTorchLogger::get_logging_prefix() { } nvinfer1::ILogger::Severity TRTorchLogger::get_reportable_severity() { - return reportable_severity_; + return (Severity) reportable_severity_; } LogLevel TRTorchLogger::get_reportable_log_level() { - return (LogLevel) reportable_severity_; + return reportable_severity_; } bool TRTorchLogger::get_is_colored_output_on() { diff --git a/core/util/logging/TRTorchLogger.h b/core/util/logging/TRTorchLogger.h index 0a3edc3203..55582a5277 100644 --- a/core/util/logging/TRTorchLogger.h +++ b/core/util/logging/TRTorchLogger.h @@ -8,12 +8,13 @@ namespace core { namespace util { namespace logging { -enum LogLevel { +enum class LogLevel : uint8_t { kINTERNAL_ERROR = (int) nvinfer1::ILogger::Severity::kINTERNAL_ERROR, kERROR = (int) nvinfer1::ILogger::Severity::kERROR, kWARNING = (int) nvinfer1::ILogger::Severity::kWARNING, kINFO = (int) nvinfer1::ILogger::Severity::kINFO, kDEBUG = (int) nvinfer1::ILogger::Severity::kVERBOSE, + kGRAPH }; // Logger for TensorRT info/warning/errors @@ -38,10 +39,10 @@ class TRTorchLogger : public nvinfer1::ILogger { private: std::string prefix_; - Severity reportable_severity_; + LogLevel reportable_severity_; bool color_; }; - + TRTorchLogger& get_logger(); } // namespace logging diff --git a/core/util/macros.h b/core/util/macros.h index 2c295988d1..7a1d1b4540 100644 --- a/core/util/macros.h +++ b/core/util/macros.h @@ -11,18 +11,21 @@ l.log(sev, ss.str()); \ } while (0) +#define GRAPH_DUMP_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kGRAPH, s) #define LOG_DEBUG_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kDEBUG, s) #define LOG_INFO_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kINFO, s) #define LOG_WARNING_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kWARNING, s) #define LOG_ERROR_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kERROR, s) -#define LOG_INTERNAL_ERROR_GLOBAL(s) 
TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kINTERNAL_ERROR, s) +#define LOG_INTERNAL_ERROR_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kINTERNAL_ERROR, s) +#define GRAPH_DUMP_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kGRAPH, s) #define LOG_DEBUG_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kDEBUG, s) #define LOG_INFO_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kINFO, s) #define LOG_WARNING_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kWARNING, s) #define LOG_ERROR_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kERROR, s) #define LOG_INTERNAL_ERROR_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kINTERNAL_ERROR, s) +#define GRAPH_DUMP(...) GET_MACRO(__VA_ARGS__, GRAPH_DUMP_OWN, GRAPH_DUMP_GLOBAL)(__VA_ARGS__) #define LOG_DEBUG(...) GET_MACRO(__VA_ARGS__, LOG_DEBUG_OWN, LOG_DEBUG_GLOBAL)(__VA_ARGS__) #define LOG_INFO(...) GET_MACRO(__VA_ARGS__, LOG_INFO_OWN, LOG_INFO_GLOBAL)(__VA_ARGS__) #define LOG_WARNING(...) GET_MACRO(__VA_ARGS__, LOG_WARNING_OWN, LOG_WARNING_GLOBAL)(__VA_ARGS__) @@ -47,7 +50,7 @@ << ", consider filing a bug: https://www.github.com/NVIDIA/TRTorch/issues\n" \ << __VA_ARGS__); \ } - + #define TRTORCH_CHECK(cond, ...) \ if (!(cond)) { \ TRTORCH_THROW_ERROR("Expected " << #cond \ From 4acc3fddc8b895553422f82129c66fb7bd147064 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Thu, 30 Apr 2020 21:30:52 -0700 Subject: [PATCH 05/13] feat(//core/lowering): New freeze model pass and new exception elimination pass Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/impl/linear.cpp | 7 ++ core/lowering/BUILD | 3 +- core/lowering/lowering.cpp | 35 +++++--- core/lowering/lowering.h | 2 +- core/lowering/{irfusers => passes}/BUILD | 8 +- .../lowering/passes/exception_elimination.cpp | 86 +++++++++++++++++++ .../expand_log_softmax.cpp | 10 +-- .../fuse_flatten_linear.cpp | 40 ++++++++- .../{irfusers/irfusers.h => passes/passes.h} | 3 +- .../{irfusers => passes}/remove_dropout.cpp | 6 +- .../unpack_batch_norm.cpp | 4 +- core/util/logging/TRTorchLogger.cpp | 2 +- core/util/macros.h | 6 +- tests/modules/hub.py | 6 +- tests/modules/test_compiled_modules.cpp | 6 +- tests/modules/test_modules_as_engines.cpp | 6 +- .../test_multiple_registered_engines.cpp | 4 +- 17 files changed, 188 insertions(+), 46 deletions(-) rename core/lowering/{irfusers => passes}/BUILD (74%) create mode 100644 core/lowering/passes/exception_elimination.cpp rename core/lowering/{irfusers => passes}/expand_log_softmax.cpp (95%) rename core/lowering/{irfusers => passes}/fuse_flatten_linear.cpp (52%) rename core/lowering/{irfusers/irfusers.h => passes/passes.h} (82%) rename core/lowering/{irfusers => passes}/remove_dropout.cpp (93%) rename core/lowering/{irfusers => passes}/unpack_batch_norm.cpp (98%) diff --git a/core/conversion/converters/impl/linear.cpp b/core/conversion/converters/impl/linear.cpp index 0b5b4e957a..faca5c3edd 100644 --- a/core/conversion/converters/impl/linear.cpp +++ b/core/conversion/converters/impl/linear.cpp @@ -9,6 +9,13 @@ namespace impl { namespace { auto linear_registrations = RegisterNodeConversionPatterns() + // .pattern({ + // "aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> (Tensor)", + // [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> { + // auto in = args[0].ITensor(); + + // } + // }) .pattern({ "aten::linear(Tensor input, Tensor weight, Tensor? 
bias = None) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { diff --git a/core/lowering/BUILD b/core/lowering/BUILD index 9a0c53c13b..4cb1978251 100644 --- a/core/lowering/BUILD +++ b/core/lowering/BUILD @@ -11,7 +11,8 @@ cc_library( ], deps = [ "@libtorch//:libtorch", - "//core/lowering/irfusers" + "//core/lowering/passes", + "//core/util:prelude" ] ) diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index d18a9d612d..53ff6d36d8 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -1,10 +1,13 @@ #include "torch/csrc/jit/passes/dead_code_elimination.h" #include "torch/csrc/jit/passes/fuse_linear.h" +#include "torch/csrc/jit/passes/freeze_module.h" #include "torch/csrc/jit/passes/lower_graph.h" #include "torch/csrc/jit/passes/quantization.h" +#include "torch/csrc/jit/passes/guard_elimination.h" +#include "core/util/prelude.h" #include "core/lowering/lowering.h" -#include "core/lowering/irfusers/irfusers.h" +#include "core/lowering/passes/passes.h" namespace trtorch { namespace core { @@ -17,30 +20,36 @@ void LowerBlock(torch::jit::Block* b) { } void LowerGraph(std::shared_ptr& g) { + torch::jit::EliminateRedundantGuards(g); + passes::EliminateExceptionOrPassPattern(g); torch::jit::FuseLinear(g); - irfusers::RemoveDropout(g); - irfusers::FuseFlattenLinear(g); - irfusers::ExpandLogSoftmax(g); + passes::RemoveDropout(g); + passes::FuseFlattenLinear(g); + passes::ExpandLogSoftmax(g); + //passes::RemoveDimExeception(g); //irfusers::UnpackBatchNorm(g); - //torch::jit::EliminateDeadCode(g); + torch::jit::EliminateDeadCode(g); + LOG_GRAPH(*g); } -void LowerModule(const torch::jit::script::Module& mod) { - torch::jit::FoldConvBatchNorm2d(mod); +torch::jit::Module LowerModule(const torch::jit::script::Module& mod) { + auto mod_ = torch::jit::freeze_module(mod); + return mod_; } std::pair, std::vector> Lower(const torch::jit::script::Module& mod, std::string method_name) { - LowerModule(mod); - auto g = mod.get_method(method_name).graph(); - // Go through PyTorch Lowering to simplify graph and extract weight parameters - auto graph_and_parameters = torch::jit::LowerGraph(*g, mod._ivalue()); - - g = graph_and_parameters.first; + auto lowered_mod = LowerModule(mod); + auto g = lowered_mod.get_method(method_name).graph(); + LOG_GRAPH(*g); // Go through TRTorch Lowering to reformat graph to be conversion friendly // and also segment for accelerators and executors (TRT-DLA, TRT-GPU, PYT) + LOG_GRAPH("TRTorch Graph Lowering"); lowering::LowerGraph(g); + //=[torch::jit::FoldConvBatchNorm2d(lowered_mod); + LOG_GRAPH("LibTorch Lowering"); + auto graph_and_parameters = torch::jit::LowerGraph(*g, lowered_mod._ivalue()); // Is this necessary? 
lowering::LowerBlock(g->block()); return graph_and_parameters; diff --git a/core/lowering/lowering.h b/core/lowering/lowering.h index 8ee2cdde53..79f07cb5ec 100644 --- a/core/lowering/lowering.h +++ b/core/lowering/lowering.h @@ -8,7 +8,7 @@ namespace lowering { void LowerBlock(torch::jit::Block* b); void LowerGraph(std::shared_ptr& g); -void LowerModule(const torch::jit::script::Module& mod); +torch::jit::Module LowerModule(const torch::jit::script::Module& mod); std::pair, std::vector> Lower(const torch::jit::script::Module& mod, std::string method_name); diff --git a/core/lowering/irfusers/BUILD b/core/lowering/passes/BUILD similarity index 74% rename from core/lowering/irfusers/BUILD rename to core/lowering/passes/BUILD index 71899dfee6..0de638898c 100644 --- a/core/lowering/irfusers/BUILD +++ b/core/lowering/passes/BUILD @@ -1,17 +1,19 @@ package(default_visibility = ["//visibility:public"]) cc_library( - name = "irfusers", + name = "passes", hdrs = [ - "irfusers.h", + "passes.h", ], srcs = [ "fuse_flatten_linear.cpp", "expand_log_softmax.cpp", "remove_dropout.cpp", - "unpack_batch_norm.cpp" + "unpack_batch_norm.cpp", + "exception_elimination.cpp" ], deps = [ + "//core/util:prelude", "@libtorch//:libtorch", ] ) diff --git a/core/lowering/passes/exception_elimination.cpp b/core/lowering/passes/exception_elimination.cpp new file mode 100644 index 0000000000..54f36a833a --- /dev/null +++ b/core/lowering/passes/exception_elimination.cpp @@ -0,0 +1,86 @@ +#include "torch/csrc/jit/passes/guard_elimination.h" +#include "torch/csrc/jit/ir/alias_analysis.h" +#include "torch/csrc/jit/jit_log.h" +#include "torch/csrc/jit/passes/constant_propagation.h" +#include "torch/csrc/jit/passes/peephole.h" +#include "torch/csrc/jit/runtime/graph_executor.h" +#include "torch/csrc/jit/passes/dead_code_elimination.h" + +#include "core/util/prelude.h" + +#include + +namespace trtorch { +namespace core { +namespace lowering { +namespace passes { +namespace { +using namespace torch::jit; +struct ExceptionOrPassPatternElimination { + ExceptionOrPassPatternElimination(std::shared_ptr graph) + : graph_(std::move(graph)) {} + + void run() { + LOG_GRAPH("Pre exeception or pass elimination: " << *graph_); + findExceptionOrPassNodes(graph_->block()); + torch::jit::EliminateDeadCode(graph_); + LOG_GRAPH("Post exeception or pass elimination: " << *graph_); + } + +private: + bool isExceptionOrPassNode(Node* n) { + /// Check if this Node hosts a pattern like so: + /// = prim::If(%5958) + /// block0(): + /// = prim::RaiseException(%45) + /// -> () + /// block1(): + /// -> () + if (n->blocks().size() != 2) { + return false; + } + auto arm1 = n->blocks()[0]; + auto arm2 = n->blocks()[1]; + if (arm1->outputs().size() != 0 || arm2->outputs().size() != 0) { + // Make sure that the node doesn't actually produce any Value that are used by other nodes + return false; + } + + auto arm1_start = arm1->nodes().begin(); + + if ((*arm1_start)->kind() != prim::RaiseException && (*(++arm1_start))->kind() != prim::Return) { + // Make sure that block0 is solely just the exception and the return + return false; + } + + if ((*(arm2->nodes().begin()))->kind() != prim::Return) { + // Make sure that block1 is solely the return + return false; + } + + return true; + } + + void findExceptionOrPassNodes(Block* b) { + for (auto it = b->nodes().begin(); it != b->nodes().end(); it++) { + auto n = *it; + if (n->kind() == prim::If && isExceptionOrPassNode(n)) { + LOG_GRAPH("Found that node " << *n << " is an exception or pass node (EliminateChecks)"); + 
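+        // destroyCurrent() unlinks and frees the matched prim::If node while keeping the
+        // block's node iterator valid, so the enclosing loop can keep walking the block.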
it.destroyCurrent(); + } + } + } + + std::shared_ptr graph_; +}; +} // namespace + +void EliminateExceptionOrPassPattern(std::shared_ptr graph) { + ExceptionOrPassPatternElimination eppe(std::move(graph)); + eppe.run(); +} + +} // namespace passes +} // namespace lowering +} // namespace core +} // namespace trtorch diff --git a/core/lowering/irfusers/expand_log_softmax.cpp b/core/lowering/passes/expand_log_softmax.cpp similarity index 95% rename from core/lowering/irfusers/expand_log_softmax.cpp rename to core/lowering/passes/expand_log_softmax.cpp index c9c3cc27ce..37bbcdd321 100644 --- a/core/lowering/irfusers/expand_log_softmax.cpp +++ b/core/lowering/passes/expand_log_softmax.cpp @@ -4,14 +4,14 @@ namespace trtorch { namespace core { namespace lowering { -namespace irfusers { +namespace passes { void ExpandLogSoftmax(std::shared_ptr& graph) { // Its easier for TensorRT if we seperate softmax and log // There might need to be a reshape inserted see: // https://github.com/onnx/onnx-tensorrt/blob/5dca8737851118f6ab8a33ea1f7bcb7c9f06caf5/builtin_op_importers.cpp#L1593 // Should the reshapes be added here or in the converter? - + // TODO: In the future this should be removed for a deicated log_softmax converter (more efficent) // But its easier to stand up a working system if the number of op converters is lower std::string logsoftmax_pattern = R"IR( @@ -33,19 +33,19 @@ void ExpandLogSoftmax(std::shared_ptr& graph) { %dtype : int? = prim::Constant() %softmax = aten::softmax(%input, %dim, %dtype) %log_softmax = aten::log(%softmax) - return (%log_softmax))IR"; + return (%log_softmax))IR"; torch::jit::SubgraphRewriter logsoftmax_to_softmax_log; logsoftmax_to_softmax_log.RegisterRewritePattern(logsoftmax_pattern, softmax_log_pattern); logsoftmax_to_softmax_log.runOnGraph(graph); - + torch::jit::SubgraphRewriter logsoftmax_none_to_softmax_log_none; logsoftmax_none_to_softmax_log_none.RegisterRewritePattern( logsoftmax_none_pattern, softmax_log_none_pattern); logsoftmax_none_to_softmax_log_none.runOnGraph(graph); } -} // namespace irfusers +} // namespace passes } // namespace lowering } // namespace core } // namespace trtorch diff --git a/core/lowering/irfusers/fuse_flatten_linear.cpp b/core/lowering/passes/fuse_flatten_linear.cpp similarity index 52% rename from core/lowering/irfusers/fuse_flatten_linear.cpp rename to core/lowering/passes/fuse_flatten_linear.cpp index 5b8c3899ec..6dc8ebf68e 100644 --- a/core/lowering/irfusers/fuse_flatten_linear.cpp +++ b/core/lowering/passes/fuse_flatten_linear.cpp @@ -4,7 +4,7 @@ namespace trtorch { namespace core { namespace lowering { -namespace irfusers { +namespace passes { void FuseFlattenLinear(std::shared_ptr& graph) { //TensorRT implicitly adds a flatten layer infront of FC layers if necessary @@ -33,13 +33,47 @@ void FuseFlattenLinear(std::shared_ptr& graph) { torch::jit::SubgraphRewriter flatten_linear_to_linear; flatten_linear_to_linear.RegisterRewritePattern(flatten_linear_pattern, fused_linear); flatten_linear_to_linear.runOnGraph(graph); - + + torch::jit::SubgraphRewriter flatten_linear_bias_none_to_linear; + flatten_linear_bias_none_to_linear.RegisterRewritePattern( + flatten_linear_bias_none_pattern, fused_linear_bias_none); + flatten_linear_bias_none_to_linear.runOnGraph(graph); +} + +void FuseFlattenAddMM(std::shared_ptr& graph) { + //TensorRT implicitly adds a flatten layer infront of FC layers if necessary + std::string flatten_linear_pattern = R"IR( + graph(%input, %6, %7, %weight, %bias): + %flat = aten::flatten(%input, %6, %7) + %res 
= aten::linear(%flat, %weight, %bias) + return (%res))IR"; + std::string flatten_linear_bias_none_pattern = R"IR( + graph(%input, %6, %7, %weight): + %flat = aten::flatten(%input, %6, %7) + %bias: Tensor? = prim::Constant() + %res = aten::linear(%flat, %weight, %bias) + return (%res))IR"; + std::string fused_linear = R"IR( + graph(%input, %6, %7, %weight, %bias): + %res = aten::linear(%input, %weight, %bias) + return (%res))IR"; + + std::string fused_linear_bias_none = R"IR( + graph(%input, %6, %7, %weight): + %bias: Tensor? = prim::Constant() + %res = aten::linear(%input, %weight, %bias) + return (%res))IR"; + + torch::jit::SubgraphRewriter flatten_linear_to_linear; + flatten_linear_to_linear.RegisterRewritePattern(flatten_linear_pattern, fused_linear); + flatten_linear_to_linear.runOnGraph(graph); + torch::jit::SubgraphRewriter flatten_linear_bias_none_to_linear; flatten_linear_bias_none_to_linear.RegisterRewritePattern( flatten_linear_bias_none_pattern, fused_linear_bias_none); flatten_linear_bias_none_to_linear.runOnGraph(graph); } -} // namespace irfusers +} // namespace passes } // namespace lowering } // namespace core } // namespace trtorch diff --git a/core/lowering/irfusers/irfusers.h b/core/lowering/passes/passes.h similarity index 82% rename from core/lowering/irfusers/irfusers.h rename to core/lowering/passes/passes.h index c48f5d112d..1ef02ac065 100644 --- a/core/lowering/irfusers/irfusers.h +++ b/core/lowering/passes/passes.h @@ -5,12 +5,13 @@ namespace trtorch { namespace core { namespace lowering { -namespace irfusers { +namespace passes { void FuseFlattenLinear(std::shared_ptr& graph); void ExpandLogSoftmax(std::shared_ptr& graph); void RemoveDropout(std::shared_ptr& graph); void UnpackBatchNorm(std::shared_ptr& graph); +void EliminateExceptionOrPassPattern(std::shared_ptr graph); } // namespace irfusers } // namespace lowering diff --git a/core/lowering/irfusers/remove_dropout.cpp b/core/lowering/passes/remove_dropout.cpp similarity index 93% rename from core/lowering/irfusers/remove_dropout.cpp rename to core/lowering/passes/remove_dropout.cpp index fcd1d07aa0..b28fe011a8 100644 --- a/core/lowering/irfusers/remove_dropout.cpp +++ b/core/lowering/passes/remove_dropout.cpp @@ -4,7 +4,7 @@ namespace trtorch { namespace core { namespace lowering { -namespace irfusers { +namespace passes { void RemoveDropout(std::shared_ptr& graph) { std::string dropout_pattern = R"IR( @@ -14,7 +14,7 @@ void RemoveDropout(std::shared_ptr& graph) { std::string no_dropout_pattern = R"IR( graph(%input, %4, %5): return (%input))IR"; - + // replace matmul + add pattern to linear torch::jit::SubgraphRewriter remove_dropout; remove_dropout.RegisterRewritePattern( @@ -22,7 +22,7 @@ void RemoveDropout(std::shared_ptr& graph) { remove_dropout.runOnGraph(graph); } -} // namespace irfusers +} // namespace passes } // namespace lowering } // namespace core } // namespace trtorch diff --git a/core/lowering/irfusers/unpack_batch_norm.cpp b/core/lowering/passes/unpack_batch_norm.cpp similarity index 98% rename from core/lowering/irfusers/unpack_batch_norm.cpp rename to core/lowering/passes/unpack_batch_norm.cpp index 8a10d747f8..b75af2c2f9 100644 --- a/core/lowering/irfusers/unpack_batch_norm.cpp +++ b/core/lowering/passes/unpack_batch_norm.cpp @@ -23,7 +23,7 @@ RegisterOperators trt_const_op_reg({ namespace trtorch { namespace core { namespace lowering { -namespace irfusers { +namespace passes { // // May be abusing aten::_tensor_to_list(Tensor self) -> int[] // // Treating it as an emit_constant by the 
converters @@ -60,7 +60,7 @@ void UnpackBatchNorm(std::shared_ptr& graph) { unpack_batch_norm.RegisterRewritePattern(batch_norm_pattern, expanded_batch_norm_pattern); unpack_batch_norm.runOnGraph(graph); } -} // Namespace Irfusers +} // Namespace passes } // namespace lowering } // namespace core } // namespace trtorch diff --git a/core/util/logging/TRTorchLogger.cpp b/core/util/logging/TRTorchLogger.cpp index 678506c09f..7c80152076 100644 --- a/core/util/logging/TRTorchLogger.cpp +++ b/core/util/logging/TRTorchLogger.cpp @@ -101,7 +101,7 @@ namespace { TRTorchLogger& get_global_logger() { #ifndef NDEBUG static TRTorchLogger global_logger("[TRTorch - Debug Build] - ", - LogLevel::kDEBUG, + LogLevel::kGRAPH, true); #else static TRTorchLogger global_logger("[TRTorch] - ", diff --git a/core/util/macros.h b/core/util/macros.h index 7a1d1b4540..ccea57e513 100644 --- a/core/util/macros.h +++ b/core/util/macros.h @@ -11,21 +11,21 @@ l.log(sev, ss.str()); \ } while (0) -#define GRAPH_DUMP_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kGRAPH, s) +#define LOG_GRAPH_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kGRAPH, s) #define LOG_DEBUG_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kDEBUG, s) #define LOG_INFO_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kINFO, s) #define LOG_WARNING_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kWARNING, s) #define LOG_ERROR_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kERROR, s) #define LOG_INTERNAL_ERROR_GLOBAL(s) TRTORCH_LOG(core::util::logging::get_logger(), core::util::logging::LogLevel::kINTERNAL_ERROR, s) -#define GRAPH_DUMP_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kGRAPH, s) +#define LOG_GRAPH_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kGRAPH, s) #define LOG_DEBUG_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kDEBUG, s) #define LOG_INFO_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kINFO, s) #define LOG_WARNING_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kWARNING, s) #define LOG_ERROR_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kERROR, s) #define LOG_INTERNAL_ERROR_OWN(l,s) TRTORCH_LOG(l, core::util::logging::LogLevel::kINTERNAL_ERROR, s) -#define GRAPH_DUMP(...) GET_MACRO(__VA_ARGS__, GRAPH_DUMP_OWN, GRAPH_DUMP_GLOBAL)(__VA_ARGS__) +#define LOG_GRAPH(...) GET_MACRO(__VA_ARGS__, LOG_GRAPH_OWN, LOG_GRAPH_GLOBAL)(__VA_ARGS__) #define LOG_DEBUG(...) GET_MACRO(__VA_ARGS__, LOG_DEBUG_OWN, LOG_DEBUG_GLOBAL)(__VA_ARGS__) #define LOG_INFO(...) GET_MACRO(__VA_ARGS__, LOG_INFO_OWN, LOG_INFO_GLOBAL)(__VA_ARGS__) #define LOG_WARNING(...) 
GET_MACRO(__VA_ARGS__, LOG_WARNING_OWN, LOG_WARNING_GLOBAL)(__VA_ARGS__) diff --git a/tests/modules/hub.py b/tests/modules/hub.py index b1b3a1a102..b3dc394478 100644 --- a/tests/modules/hub.py +++ b/tests/modules/hub.py @@ -20,5 +20,7 @@ print("Downloading {}".format(n)) m = m.eval().cuda() x = torch.ones((1, 3, 224, 224)).cuda() - jit_model = torch.jit.trace(m, x) - torch.jit.save(jit_model, n + '.jit.pt') + trace_model = torch.jit.trace(m, x) + torch.jit.save(trace_model, n + '_traced.jit.pt') + script_model = torch.jit.script(m) + torch.jit.save(script_model, n + '_scripted.jit.pt') \ No newline at end of file diff --git a/tests/modules/test_compiled_modules.cpp b/tests/modules/test_compiled_modules.cpp index 199e1d81b5..9a5c9daf1d 100644 --- a/tests/modules/test_compiled_modules.cpp +++ b/tests/modules/test_compiled_modules.cpp @@ -28,9 +28,9 @@ TEST_P(ModuleTests, CompiledModuleIsClose) { INSTANTIATE_TEST_SUITE_P(CompiledModuleForwardIsCloseSuite, ModuleTests, testing::Values( - PathAndInSize({"tests/modules/resnet18.jit.pt", + PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1,3,224,224}}}), - PathAndInSize({"tests/modules/resnet50.jit.pt", + PathAndInSize({"tests/modules/resnet50_traced.jit.pt", {{1,3,224,224}}}), - PathAndInSize({"tests/modules/mobilenet_v2.jit.pt", + PathAndInSize({"tests/modules/mobilenet_v2_traced.jit.pt", {{1,3,224,224}}}))); diff --git a/tests/modules/test_modules_as_engines.cpp b/tests/modules/test_modules_as_engines.cpp index 759ed136c9..d190251bb3 100644 --- a/tests/modules/test_modules_as_engines.cpp +++ b/tests/modules/test_modules_as_engines.cpp @@ -19,9 +19,9 @@ TEST_P(ModuleTests, ModuleAsEngineIsClose) { INSTANTIATE_TEST_SUITE_P(ModuleAsEngineForwardIsCloseSuite, ModuleTests, testing::Values( - PathAndInSize({"tests/modules/resnet18.jit.pt", + PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1,3,224,224}}}), - PathAndInSize({"tests/modules/resnet50.jit.pt", + PathAndInSize({"tests/modules/resnet50_traced.jit.pt", {{1,3,224,224}}}), - PathAndInSize({"tests/modules/mobilenet_v2.jit.pt", + PathAndInSize({"tests/modules/mobilenet_v2_traced.jit.pt", {{1,3,224,224}}}))); \ No newline at end of file diff --git a/tests/modules/test_multiple_registered_engines.cpp b/tests/modules/test_multiple_registered_engines.cpp index 7ce3dbf61f..c03e68c0b4 100644 --- a/tests/modules/test_multiple_registered_engines.cpp +++ b/tests/modules/test_multiple_registered_engines.cpp @@ -8,8 +8,8 @@ TEST(ModuleTests, CanRunMultipleEngines) { torch::jit::script::Module mod1; torch::jit::script::Module mod2; try { - mod1 = torch::jit::load("tests/modules/resnet50.jit.pt"); - mod2 = torch::jit::load("tests/modules/resnet18.jit.pt"); + mod1 = torch::jit::load("tests/modules/resnet50_traced.jit.pt"); + mod2 = torch::jit::load("tests/modules/resnet18_traced.jit.pt"); } catch (const c10::Error& e) { std::cerr << "error loading the model\n"; From d945eb933b83823f45ca79d907d2285ebee67cfb Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 1 May 2020 15:42:59 -0700 Subject: [PATCH 06/13] feat(aten::flatten): Adds a converter for aten flatten since MM is the preferred path now Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/converters/impl/shuffle.cpp | 54 ++++++++++++++------- tests/core/converters/test_shuffle.cpp | 48 ++++++++++++++++++ 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/core/conversion/converters/impl/shuffle.cpp b/core/conversion/converters/impl/shuffle.cpp index 6c262b89a9..f775f5790b 100644 --- 
a/core/conversion/converters/impl/shuffle.cpp +++ b/core/conversion/converters/impl/shuffle.cpp @@ -1,5 +1,7 @@ #include "core/conversion/converters/converters.h" +#include "torch/torch.h" + namespace trtorch { namespace core { namespace conversion { @@ -8,23 +10,41 @@ namespace impl { namespace { static auto shuffle_registrations = RegisterNodeConversionPatterns() - .pattern({ - "aten::reshape(Tensor self, int[] shape) -> (Tensor)", - [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { - auto in = args[0].ITensor(); - auto new_shape = util::toDimsPad(args[1].unwrapToIntList(), 2); - - auto shuffle = ctx->net->addShuffle(*in); - TRTORCH_CHECK(shuffle, "Unable to create shuffle layer from node: " << *n); - shuffle->setReshapeDimensions(new_shape); - shuffle->setName(util::node_info(n).c_str()); - - auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); - LOG_DEBUG("Output tensor shape: " << out_tensor->getDimensions()); - - return true; - } - }); + .pattern({ + "aten::flatten.using_ints(Tensor self, int start_dim=0, int end_dim=-1) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto start_dim = args[1].unwrapToInt(); + auto end_dim = args[2].unwrapToInt(); + auto in_shape = util::toVec(in->getDimensions()); + auto out_shape = torch::flatten(torch::rand(in_shape), start_dim, end_dim).sizes(); + + auto shuffle = ctx->net->addShuffle(*in); + TRTORCH_CHECK(shuffle, "Unable to create shuffle layer from node: " << *n); + shuffle->setReshapeDimensions(util::toDims(out_shape)); + shuffle->setName(util::node_info(n).c_str()); + + auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); + LOG_DEBUG("Output tensor shape: " << out_tensor->getDimensions()); + return true; + } + }).pattern({ + "aten::reshape(Tensor self, int[] shape) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto new_shape = util::toDimsPad(args[1].unwrapToIntList(), 2); + + auto shuffle = ctx->net->addShuffle(*in); + TRTORCH_CHECK(shuffle, "Unable to create shuffle layer from node: " << *n); + shuffle->setReshapeDimensions(new_shape); + shuffle->setName(util::node_info(n).c_str()); + + auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); + LOG_DEBUG("Output tensor shape: " << out_tensor->getDimensions()); + + return true; + } + }); } // namespace } // namespace impl } // namespace converters diff --git a/tests/core/converters/test_shuffle.cpp b/tests/core/converters/test_shuffle.cpp index 459d271d05..7539dfcf39 100644 --- a/tests/core/converters/test_shuffle.cpp +++ b/tests/core/converters/test_shuffle.cpp @@ -4,6 +4,54 @@ #include "tests/util/util.h" #include "core/compiler.h" +// TODO: IR Parser doesnt work well with neg numbers +TEST(Converters, ATenFlattenConvertsCorrectly) { + const auto graph = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=0]() + %2 : int = prim::Constant[value=1]() + %3 : Tensor = aten::flatten(%0, %1, %2) + return (%3))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, &*g); + + auto in = at::randint(0, 5, {2, 3}, {at::kCUDA}); + auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto jit_results = trtorch::tests::util::RunGraph(g, params, {in}); + + in = at::clone(in); + params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto trt_results = 
trtorch::tests::util::RunGraphEngine(g, params, {in}); + auto trt = trt_results[0].reshape_as(jit_results[0]); + + ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[0], trt, 2e-6)); +} + +// TODO: IR Parser doesnt work well with neg numbers +TEST(Converters, ATenFlattenOtherDimsConvertsCorrectly) { + const auto graph = R"IR( + graph(%0 : Tensor): + %1 : int = prim::Constant[value=1]() + %2 : int = prim::Constant[value=2]() + %3 : Tensor = aten::flatten(%0, %1, %2) + return (%3))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, &*g); + + auto in = at::randint(0, 5, {2, 3, 3}, {at::kCUDA}); + auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto jit_results = trtorch::tests::util::RunGraph(g, params, {in}); + + in = at::clone(in); + params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto trt_results = trtorch::tests::util::RunGraphEngine(g, params, {in}); + auto trt = trt_results[0].reshape_as(jit_results[0]); + + ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[0], trt, 2e-6)); +} + TEST(Converters, ATenReshapeConvertsCorrectly) { const auto graph = R"IR( graph(%0 : Tensor): From c5b62026b1e93f85fee10f991a23fa17652ae125 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Fri, 1 May 2020 22:04:06 -0700 Subject: [PATCH 07/13] feat(aten::matmul|aten::addmm): Adds support for aten::matmul and aten::admm Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/conversion.cpp | 6 +- core/conversion/converters/Arg.cpp | 2 + core/conversion/converters/BUILD | 1 + .../converters/impl/element_wise.cpp | 9 +++ core/conversion/converters/impl/linear.cpp | 9 +-- .../converters/impl/matrix_multiply.cpp | 55 +++++++++++++++++++ core/lowering/BUILD | 4 +- core/lowering/lowering.cpp | 1 + core/lowering/passes/BUILD | 3 +- core/lowering/passes/fuse_flatten_linear.cpp | 33 ----------- core/lowering/passes/passes.h | 1 + core/lowering/passes/unpack_addmm.cpp | 32 +++++++++++ core/lowering/passes/unpack_batch_norm.cpp | 20 ------- core/lowering/register_const_op.cpp | 21 +++++++ tests/core/converters/BUILD | 47 +++++++++------- .../core/converters/test_matrix_multiply.cpp | 27 +++++++++ tests/util/run_graph_engine.cpp | 21 +++---- 17 files changed, 197 insertions(+), 95 deletions(-) create mode 100644 core/conversion/converters/impl/matrix_multiply.cpp create mode 100644 core/lowering/passes/unpack_addmm.cpp create mode 100644 core/lowering/register_const_op.cpp create mode 100644 tests/core/converters/test_matrix_multiply.cpp diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 4b5fbf0fd2..712d355bcc 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -73,7 +73,11 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { LOG_DEBUG(ctx->logger, "Node input is a value that needs to be evaluated"); auto eval = EvaluateNode(ctx, input_node); if (eval) { - LOG_DEBUG(ctx->logger, "Found the value to be: " << eval.value()); + if (!eval.value().isTensor()) { + LOG_DEBUG(ctx->logger, "Found the value to be: " << eval.value()); + } else { + LOG_DEBUG(ctx->logger, "Found the value to be a tensor (shape " << eval.value().toTensor().sizes() << ')'); + } ctx->evaluated_value_map[input] = std::move(eval.value()); node_args.push_back(&(ctx->evaluated_value_map[input])); } else { diff --git a/core/conversion/converters/Arg.cpp b/core/conversion/converters/Arg.cpp index af23713ea1..90e4a423dc 100644 --- a/core/conversion/converters/Arg.cpp +++ 
b/core/conversion/converters/Arg.cpp @@ -89,6 +89,7 @@ std::string Arg::type_name() const { } const torch::jit::IValue* Arg::IValue() const { + TRTORCH_CHECK(isIValue(), "Requested IValue from Arg, however arg type is " << type_name()); if (type_ == Type::kIValue) { return ptr_.ivalue; } else { @@ -97,6 +98,7 @@ const torch::jit::IValue* Arg::IValue() const { } nvinfer1::ITensor* Arg::ITensor() const { + TRTORCH_CHECK(isITensor(), "Requested ITensor from Arg, however arg type is " << type_name()); if (type_ == Type::kITensor) { return ptr_.tensor; } else { diff --git a/core/conversion/converters/BUILD b/core/conversion/converters/BUILD index ccc72bb4cb..77a0331a0a 100644 --- a/core/conversion/converters/BUILD +++ b/core/conversion/converters/BUILD @@ -15,6 +15,7 @@ cc_library( "impl/conv_deconv.cpp", "impl/element_wise.cpp", "impl/linear.cpp", + "impl/matrix_multiply.cpp", "impl/pooling.cpp", "impl/reduce.cpp", "impl/shuffle.cpp", diff --git a/core/conversion/converters/impl/element_wise.cpp b/core/conversion/converters/impl/element_wise.cpp index ee0e60a9d7..80d5cdbe93 100644 --- a/core/conversion/converters/impl/element_wise.cpp +++ b/core/conversion/converters/impl/element_wise.cpp @@ -14,6 +14,15 @@ nvinfer1::ILayer* add_elementwise(ConversionCtx* ctx, nvinfer1::ElementWiseOpera TRTORCH_CHECK(util::volume(self_dims) == util::volume(other_dims), "Found inputs to elementwise operation do not have the same number of elements:\n Found: self " << self_dims << " other " << other_dims); + if (self_dims != other_dims) { + LOG_DEBUG("Input shape dont match inserting shuffle layers to reshape to " << self_dims); + auto other_shuffle = ctx->net->addShuffle(*other); + other_shuffle->setReshapeDimensions(self_dims); + other_shuffle->setName(std::string("[Reshape other to " + util::toStr(self_dims) + ']').c_str()); + other = other_shuffle->getOutput(0); + } + + nvinfer1::ILayer* ele; if (scalar != 1) { LOG_WARNING("Please verify scalar handling in add converter, channel axis set to 3 but scaling is uniform"); diff --git a/core/conversion/converters/impl/linear.cpp b/core/conversion/converters/impl/linear.cpp index faca5c3edd..f4c49ec020 100644 --- a/core/conversion/converters/impl/linear.cpp +++ b/core/conversion/converters/impl/linear.cpp @@ -9,13 +9,6 @@ namespace impl { namespace { auto linear_registrations = RegisterNodeConversionPatterns() - // .pattern({ - // "aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> (Tensor)", - // [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> { - // auto in = args[0].ITensor(); - - // } - // }) .pattern({ "aten::linear(Tensor input, Tensor weight, Tensor? 
bias = None) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { @@ -71,4 +64,4 @@ auto linear_registrations = RegisterNodeConversionPatterns() } // namespace converters } // namespace conversion } // namespace core -} // trtorch +} // namespace trtorch diff --git a/core/conversion/converters/impl/matrix_multiply.cpp b/core/conversion/converters/impl/matrix_multiply.cpp new file mode 100644 index 0000000000..c6d2d99f1e --- /dev/null +++ b/core/conversion/converters/impl/matrix_multiply.cpp @@ -0,0 +1,55 @@ +#include "core/util/prelude.h" +#include "core/conversion/converters/converters.h" + +namespace trtorch { +namespace core { +namespace conversion { +namespace converters { +namespace impl { +namespace { + +auto mm_registrations = RegisterNodeConversionPatterns() + .pattern({ + "aten::matmul(Tensor self, Tensor other) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + nvinfer1::ITensor* self; + if (args[0].isIValue()) { + auto t = args[0].unwrapToTensor(); + auto t_weights = Weights(ctx, t); + auto const_layer = ctx->net->addConstant(t_weights.shape, t_weights.data); + TRTORCH_CHECK(const_layer, "Unable to freeze tensor self for node: " << *n); + const_layer->setName((util::node_info(n) + " [Freeze Tensor(self)]").c_str()); + self = const_layer->getOutput(0); + } else { + self = args[0].ITensor(); + } + LOG_DEBUG("self tensor shape: " << self->getDimensions()); + + nvinfer1::ITensor* other; + if (args[1].isIValue()) { + auto t = args[1].unwrapToTensor(); + auto t_weights = Weights(ctx, t); + auto const_layer = ctx->net->addConstant(t_weights.shape, t_weights.data); + TRTORCH_CHECK(const_layer, "Unable to freeze tensor other for node: " << *n); + const_layer->setName((util::node_info(n) + " [Freeze Tensor(other)]").c_str()); + other = const_layer->getOutput(0); + } else { + other = args[1].ITensor(); + } + LOG_DEBUG("other tensor shape: " << other->getDimensions()); + + auto mm_layer = ctx->net->addMatrixMultiply(*self, nvinfer1::MatrixOperation::kNONE, *other, nvinfer1::MatrixOperation::kNONE); + TRTORCH_CHECK(mm_layer, "Unable to create matrix multiplication node: " << *n); + mm_layer->setName(util::node_info(n).c_str()); + auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], mm_layer->getOutput(0)); + + LOG_DEBUG("Output tensor shape: " << out_tensor->getDimensions()); + return true; + } + }); +} // namespace +} // namespace impl +} // namespace converters +} // namespace conversion +} // namespace core +} // namespace trtorch \ No newline at end of file diff --git a/core/lowering/BUILD b/core/lowering/BUILD index 4cb1978251..4d4822a2a9 100644 --- a/core/lowering/BUILD +++ b/core/lowering/BUILD @@ -8,12 +8,14 @@ cc_library( srcs = [ "lowering.cpp", "drop_unused_nodes.cpp", + "register_const_op.cpp" ], deps = [ "@libtorch//:libtorch", "//core/lowering/passes", "//core/util:prelude" - ] + ], + alwayslink = True ) load("@rules_pkg//:pkg.bzl", "pkg_tar") diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index 53ff6d36d8..22d613f1c6 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -25,6 +25,7 @@ void LowerGraph(std::shared_ptr& g) { torch::jit::FuseLinear(g); passes::RemoveDropout(g); passes::FuseFlattenLinear(g); + passes::UnpackAddMM(g); passes::ExpandLogSoftmax(g); //passes::RemoveDimExeception(g); //irfusers::UnpackBatchNorm(g); diff --git a/core/lowering/passes/BUILD b/core/lowering/passes/BUILD index 0de638898c..38235baea5 100644 --- 
a/core/lowering/passes/BUILD +++ b/core/lowering/passes/BUILD @@ -10,7 +10,8 @@ cc_library( "expand_log_softmax.cpp", "remove_dropout.cpp", "unpack_batch_norm.cpp", - "exception_elimination.cpp" + "exception_elimination.cpp", + "unpack_addmm.cpp" ], deps = [ "//core/util:prelude", diff --git a/core/lowering/passes/fuse_flatten_linear.cpp b/core/lowering/passes/fuse_flatten_linear.cpp index 6dc8ebf68e..3a00a1b109 100644 --- a/core/lowering/passes/fuse_flatten_linear.cpp +++ b/core/lowering/passes/fuse_flatten_linear.cpp @@ -40,39 +40,6 @@ void FuseFlattenLinear(std::shared_ptr& graph) { flatten_linear_bias_none_to_linear.runOnGraph(graph); } -void FuseFlattenAddMM(std::shared_ptr& graph) { - //TensorRT implicitly adds a flatten layer infront of FC layers if necessary - std::string flatten_linear_pattern = R"IR( - graph(%input, %6, %7, %weight, %bias): - %flat = aten::flatten(%input, %6, %7) - %res = aten::linear(%flat, %weight, %bias) - return (%res))IR"; - std::string flatten_linear_bias_none_pattern = R"IR( - graph(%input, %6, %7, %weight): - %flat = aten::flatten(%input, %6, %7) - %bias: Tensor? = prim::Constant() - %res = aten::linear(%flat, %weight, %bias) - return (%res))IR"; - std::string fused_linear = R"IR( - graph(%input, %6, %7, %weight, %bias): - %res = aten::linear(%input, %weight, %bias) - return (%res))IR"; - - std::string fused_linear_bias_none = R"IR( - graph(%input, %6, %7, %weight): - %bias: Tensor? = prim::Constant() - %res = aten::linear(%input, %weight, %bias) - return (%res))IR"; - - torch::jit::SubgraphRewriter flatten_linear_to_linear; - flatten_linear_to_linear.RegisterRewritePattern(flatten_linear_pattern, fused_linear); - flatten_linear_to_linear.runOnGraph(graph); - - torch::jit::SubgraphRewriter flatten_linear_bias_none_to_linear; - flatten_linear_bias_none_to_linear.RegisterRewritePattern( - flatten_linear_bias_none_pattern, fused_linear_bias_none); - flatten_linear_bias_none_to_linear.runOnGraph(graph); -} } // namespace passes } // namespace lowering } // namespace core diff --git a/core/lowering/passes/passes.h b/core/lowering/passes/passes.h index 1ef02ac065..1879149f62 100644 --- a/core/lowering/passes/passes.h +++ b/core/lowering/passes/passes.h @@ -11,6 +11,7 @@ void FuseFlattenLinear(std::shared_ptr& graph); void ExpandLogSoftmax(std::shared_ptr& graph); void RemoveDropout(std::shared_ptr& graph); void UnpackBatchNorm(std::shared_ptr& graph); +void UnpackAddMM(std::shared_ptr& graph); void EliminateExceptionOrPassPattern(std::shared_ptr graph); } // namespace irfusers diff --git a/core/lowering/passes/unpack_addmm.cpp b/core/lowering/passes/unpack_addmm.cpp new file mode 100644 index 0000000000..b15a752c2f --- /dev/null +++ b/core/lowering/passes/unpack_addmm.cpp @@ -0,0 +1,32 @@ +#include "torch/csrc/jit/passes/fuse_linear.h" +#include "torch/csrc/jit/passes/subgraph_rewrite.h" + +namespace trtorch { +namespace core { +namespace lowering { +namespace passes { + +void UnpackAddMM(std::shared_ptr& graph) { + //TensorRT implicitly adds a flatten layer infront of FC layers if necessary + std::string addmm_pattern = R"IR( + graph(%b, %x, %w, %1): + %out: Tensor = aten::addmm(%b, %x, %w, %1, %1) + return (%out))IR"; + std::string mm_add_pattern = R"IR( + graph(%b, %x, %w, %1): + %mm: Tensor = aten::matmul(%x, %w) + %bias: Tensor = trt::const(%b) + %out: Tensor = aten::add_(%bias, %mm, %1) + return (%out))IR"; + + + torch::jit::SubgraphRewriter unpack_addmm; + unpack_addmm.RegisterRewritePattern(addmm_pattern, mm_add_pattern); + 
unpack_addmm.runOnGraph(graph); +} + + +} // namespace passes +} // namespace lowering +} // namespace core +} // namespace trtorch diff --git a/core/lowering/passes/unpack_batch_norm.cpp b/core/lowering/passes/unpack_batch_norm.cpp index b75af2c2f9..bcfa5930cb 100644 --- a/core/lowering/passes/unpack_batch_norm.cpp +++ b/core/lowering/passes/unpack_batch_norm.cpp @@ -1,25 +1,5 @@ -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/subgraph_rewrite.h" -namespace torch { -namespace jit { - -c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; -} - -RegisterOperators trt_const_op_reg({ - Operator( - "trt::const(Tensor val) -> Tensor", - [](Stack& stack) { - return 0; //nop - }, - aliasAnalysisFromSchema())}); - -} // namespace jit -} // namespace torch - namespace trtorch { namespace core { namespace lowering { diff --git a/core/lowering/register_const_op.cpp b/core/lowering/register_const_op.cpp new file mode 100644 index 0000000000..3952e978d5 --- /dev/null +++ b/core/lowering/register_const_op.cpp @@ -0,0 +1,21 @@ +#include "torch/csrc/jit/runtime/custom_operator.h" + +namespace torch { +namespace jit { + +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +/// Op marks a Tensor to be conveted from an Torch Tensor +/// to a TRT constant Tensor +RegisterOperators trt_const_op_reg({ + Operator( + "trt::const(Tensor val) -> Tensor", + [](Stack& stack) { + return 0; //noop + }, + aliasAnalysisFromSchema())}); + +} // namespace jit +} // namespace torch \ No newline at end of file diff --git a/tests/core/converters/BUILD b/tests/core/converters/BUILD index 39d978afa9..b47692a886 100644 --- a/tests/core/converters/BUILD +++ b/tests/core/converters/BUILD @@ -1,54 +1,59 @@ load("//tests/core/converters:converter_test.bzl", "converter_test") converter_test( - name = "test_softmax" + name = "test_activation" ) converter_test( - name = "test_shuffle" + name = "test_conv" ) converter_test( - name = "test_activation" + name = "test_element_wise" ) converter_test( - name = "test_pooling" + name = "test_linear" ) converter_test( - name = "test_unary" + name = "test_matrix_multiply" ) converter_test( - name = "test_linear" + name = "test_pooling" ) converter_test( - name = "test_element_wise" + name = "test_reduce" ) converter_test( - name = "test_conv" + name = "test_shuffle" ) converter_test( - name = "test_reduce" + name = "test_softmax" +) + +converter_test( + name = "test_unary" ) test_suite( - name = "test_converters", - tests = [ - ":test_softmax", - ":test_shuffle", - ":test_activation", - ":test_pooling", - ":test_unary", - ":test_linear", - ":test_element_wise", - ":test_conv", - ":test_reduce" - ] + name = "test_converters", + tests = [ + ":test_activation", + ":test_conv", + ":test_element_wise", + ":test_linear", + ":test_matrix_multiply", + ":test_pooling", + ":test_reduce", + ":test_shuffle", + ":test_softmax", + ":test_unary", + ] ) diff --git a/tests/core/converters/test_matrix_multiply.cpp b/tests/core/converters/test_matrix_multiply.cpp new file mode 100644 index 0000000000..8ec8035d8d --- /dev/null +++ b/tests/core/converters/test_matrix_multiply.cpp @@ -0,0 +1,27 @@ +#include +#include "gtest/gtest.h" +#include "torch/csrc/jit/ir/irparser.h" +#include "tests/util/util.h" +#include "core/compiler.h" + +// TODO: IR Parser doesnt work well with neg numbers +TEST(Converters, ATenMMConvertsCorrectly) { + const auto 
graph = R"IR( + graph(%0 : Tensor, %1 : Tensor): + %2 : Tensor = aten::matmul(%0, %1) + return (%2))IR"; + + auto g = std::make_shared(); + torch::jit::parseIR(graph, &*g); + + auto in1 = at::randint(0, 5, {2, 3}, {at::kCUDA}); + auto in2 = at::randint(0, 5, {3, 3}, {at::kCUDA}); + auto params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto jit_results = trtorch::tests::util::RunGraph(g, params, {in1, in2}); + + params = trtorch::core::conversion::get_named_params(g->inputs(), {}); + auto trt_results = trtorch::tests::util::RunGraphEngine(g, params, {in1, in2}); + auto trt = trt_results[0].reshape_as(jit_results[0]); + + ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[0], trt, 2e-6)); +} diff --git a/tests/util/run_graph_engine.cpp b/tests/util/run_graph_engine.cpp index 07e0a9f3ce..f9a3e3f388 100644 --- a/tests/util/run_graph_engine.cpp +++ b/tests/util/run_graph_engine.cpp @@ -28,34 +28,35 @@ std::vector RunEngine(std::string& eng, std::vector inpu std::vector contig_inputs{}; contig_inputs.reserve(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { + TRTORCH_CHECK(inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device()); + auto expected_type = core::util::toATenDType(ctx->getEngine().getBindingDataType(i)); + TRTORCH_CHECK(inputs[i].dtype() == expected_type, "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype()); auto dims = core::util::toDimsPad(inputs[i].sizes(), 1); auto shape = core::util::toVec(dims); - contig_inputs.push_back(inputs[i].to(at::kCUDA).view(shape).contiguous()); + contig_inputs.push_back(inputs[i].view(shape).contiguous()); LOG_DEBUG("In shape:" << shape); ctx->setBindingDimensions(i, dims); gpu_handles.push_back(contig_inputs.back().data_ptr()); } - if (!ctx->allInputDimensionsSpecified()) { - LOG_ERROR("Not enough inputs provided (tests.runEngine)"); - return {}; - } + TRTORCH_CHECK(ctx->allInputDimensionsSpecified(), "Not enough inputs provided (execution.RunCudaEngine)"); std::vector outputs; - for (int o = inputs.size(); o < engine->getNbBindings(); o++) { + for (int64_t o = inputs.size(); o < engine->getNbBindings(); o++) { auto out_shape = ctx->getBindingDimensions(o); LOG_DEBUG("Output: " << engine->getBindingName(o) << " out shape: " << out_shape); auto dims = core::util::toVec(out_shape); - outputs.push_back(at::empty(dims, {at::kCUDA}).contiguous()); + auto type = core::util::toATenDType(ctx->getEngine().getBindingDataType(o)); + outputs.push_back(at::empty(dims, {at::kCUDA}).to(type).contiguous()); gpu_handles.push_back(outputs[outputs.size() - 1].data_ptr()); } - - c10::cuda::CUDAStream stream = c10::cuda::getStreamFromPool(true, 0); + // Is this the right stream? 
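+  // Using the current stream for the inputs' device should keep this enqueue
+  // ordered after any work PyTorch has already queued there; a stream pulled
+  // from the pool would likely need an explicit synchronization with it first.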
+ c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index()); ctx->enqueueV2(gpu_handles.data(), stream, nullptr); - stream.synchronize(); + stream.synchronize(); return outputs; } From 054854066d51bba816486e7b5cdf7cde06bbf923 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sat, 2 May 2020 22:37:47 -0700 Subject: [PATCH 08/13] feat(aten::size [static]): Implement a aten::size converter for static input size Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/conversion.cpp | 2 +- .../conversionctx/ConversionCtx.cpp | 6 ++++ core/conversion/conversionctx/ConversionCtx.h | 2 +- core/conversion/converters/BUILD | 1 + core/conversion/converters/impl/shape.cpp | 32 +++++++++++++++++++ core/conversion/converters/impl/shuffle.cpp | 5 +-- core/execution/TRTEngine.cpp | 2 -- core/execution/execution.h | 2 +- 8 files changed, 45 insertions(+), 7 deletions(-) create mode 100644 core/conversion/converters/impl/shape.cpp diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 712d355bcc..248ad52b98 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -78,7 +78,7 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) { } else { LOG_DEBUG(ctx->logger, "Found the value to be a tensor (shape " << eval.value().toTensor().sizes() << ')'); } - ctx->evaluated_value_map[input] = std::move(eval.value()); + ctx->AssociateValueAndIValue(input, eval.value()); node_args.push_back(&(ctx->evaluated_value_map[input])); } else { LOG_DEBUG(ctx->logger, "Found the value is None");; diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index a3a5ddfc01..4afeb8238a 100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -103,9 +103,15 @@ nvinfer1::ITensor* ConversionCtx::AssociateValueAndTensor(const torch::jit::Valu return tensor; } +torch::jit::IValue* ConversionCtx::AssociateValueAndIValue(const torch::jit::Value* value, torch::jit::IValue ivalue) { + this->evaluated_value_map[value] = std::move(ivalue); + return &this->evaluated_value_map[value]; +} + std::string ConversionCtx::SerializeEngine() { auto engine = builder->buildEngineWithConfig(*net, *cfg); auto serialized_engine = engine->serialize(); + engine->destroy(); return std::string((const char*)serialized_engine->data(), serialized_engine->size()); } diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h index c06b816107..81bf99ca7c 100644 --- a/core/conversion/conversionctx/ConversionCtx.h +++ b/core/conversion/conversionctx/ConversionCtx.h @@ -10,7 +10,6 @@ #include "core/util/prelude.h" - namespace trtorch { namespace core { namespace conversion { @@ -39,6 +38,7 @@ struct ConversionCtx { ConversionCtx(BuilderSettings settings); std::string SerializeEngine(); nvinfer1::ITensor* AssociateValueAndTensor(const torch::jit::Value* value, nvinfer1::ITensor* tensor); + torch::jit::IValue* AssociateValueAndIValue(const torch::jit::Value* value, torch::jit::IValue tensor); bool CheckLayerAddition(const torch::jit::Node* n); ~ConversionCtx(); diff --git a/core/conversion/converters/BUILD b/core/conversion/converters/BUILD index 77a0331a0a..464ad44550 100644 --- a/core/conversion/converters/BUILD +++ b/core/conversion/converters/BUILD @@ -18,6 +18,7 @@ cc_library( "impl/matrix_multiply.cpp", "impl/pooling.cpp", "impl/reduce.cpp", + "impl/shape.cpp", "impl/shuffle.cpp", 
"impl/softmax.cpp", "impl/unary.cpp", diff --git a/core/conversion/converters/impl/shape.cpp b/core/conversion/converters/impl/shape.cpp new file mode 100644 index 0000000000..d5b3577a34 --- /dev/null +++ b/core/conversion/converters/impl/shape.cpp @@ -0,0 +1,32 @@ +#include "core/conversion/converters/converters.h" + +#include "torch/torch.h" + +namespace trtorch { +namespace core { +namespace conversion { +namespace converters { +namespace impl { +namespace { + +static auto shape_registrations = RegisterNodeConversionPatterns() + .pattern({ + // To use in static input size cases (explicit batch) + "aten::size.int(Tensor self, int dim) -> (Tensor)", + [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { + auto in = args[0].ITensor(); + auto in_shape = util::toVec(in->getDimensions()); + + auto size = in_shape[args[1].unwrapToInt()]; + + ctx->AssociateValueAndIValue(n->outputs()[0], size); + LOG_DEBUG("Output Value: " << size); + return true; + } + }); +} // namespace +} // namespace impl +} // namespace converters +} // namespace conversion +} // namespace core +} // namespace trtorch diff --git a/core/conversion/converters/impl/shuffle.cpp b/core/conversion/converters/impl/shuffle.cpp index f775f5790b..8a8853fe2d 100644 --- a/core/conversion/converters/impl/shuffle.cpp +++ b/core/conversion/converters/impl/shuffle.cpp @@ -32,11 +32,12 @@ static auto shuffle_registrations = RegisterNodeConversionPatterns() "aten::reshape(Tensor self, int[] shape) -> (Tensor)", [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool { auto in = args[0].ITensor(); - auto new_shape = util::toDimsPad(args[1].unwrapToIntList(), 2); + auto in_shape = util::toVec(in->getDimensions()); + auto new_shape = torch::reshape(torch::rand(in_shape), args[1].unwrapToIntList().vec()).sizes(); auto shuffle = ctx->net->addShuffle(*in); TRTORCH_CHECK(shuffle, "Unable to create shuffle layer from node: " << *n); - shuffle->setReshapeDimensions(new_shape); + shuffle->setReshapeDimensions(util::toDims(new_shape)); shuffle->setName(util::node_info(n).c_str()); auto out_tensor = ctx->AssociateValueAndTensor(n->outputs()[0], shuffle->getOutput(0)); diff --git a/core/execution/TRTEngine.cpp b/core/execution/TRTEngine.cpp index 0364dcf1c8..3370ea6f5b 100644 --- a/core/execution/TRTEngine.cpp +++ b/core/execution/TRTEngine.cpp @@ -10,8 +10,6 @@ namespace trtorch { namespace core { namespace execution { -TRTEngine::TRTEngine() {} - TRTEngine::TRTEngine(nvinfer1::ILogger& logger, std::string& serialized_engine) { rt = nvinfer1::createInferRuntime(logger); diff --git a/core/execution/execution.h b/core/execution/execution.h index 2574717d8c..8c50dd4207 100644 --- a/core/execution/execution.h +++ b/core/execution/execution.h @@ -17,7 +17,7 @@ struct TRTEngine { std::pair num_io; EngineID id; - TRTEngine(); + TRTEngine() = default; TRTEngine(nvinfer1::ILogger& logger, std::string& serialized_engine); TRTEngine& operator=(const TRTEngine& other); }; From a9f33e4d6645d6d87844f38d4c6a03257e45204b Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 3 May 2020 18:43:44 -0700 Subject: [PATCH 09/13] fix(//core/conversion/converters/impl/element_wise): Fix broadcast support Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- .../converters/impl/element_wise.cpp | 30 +++++++++++-------- tests/accuracy/accuracy_test.h | 1 + tests/accuracy/test_fp16_accuracy.cpp | 4 +-- tests/accuracy/test_fp32_accuracy.cpp | 4 +-- tests/accuracy/test_int8_accuracy.cpp | 4 +-- 5 files changed, 24 insertions(+), 19 deletions(-) 
diff --git a/core/conversion/converters/impl/element_wise.cpp b/core/conversion/converters/impl/element_wise.cpp index 80d5cdbe93..375e7a2d8f 100644 --- a/core/conversion/converters/impl/element_wise.cpp +++ b/core/conversion/converters/impl/element_wise.cpp @@ -8,18 +8,22 @@ namespace converters { namespace impl { namespace { -nvinfer1::ILayer* add_elementwise(ConversionCtx* ctx, nvinfer1::ElementWiseOperation op, nvinfer1::ITensor* self, nvinfer1::ITensor* other, float scalar=1) { +nvinfer1::ILayer* add_elementwise(ConversionCtx* ctx, nvinfer1::ElementWiseOperation op, nvinfer1::ITensor* self, nvinfer1::ITensor* other, const std::string& name, float scalar=1) { auto self_dims = self->getDimensions(); + auto self_dims_vec = util::toVec(self_dims); auto other_dims = other->getDimensions(); + auto other_dims_vec = util::toVec(other_dims); + auto other_batch = other_dims_vec[0]; - TRTORCH_CHECK(util::volume(self_dims) == util::volume(other_dims), "Found inputs to elementwise operation do not have the same number of elements:\n Found: self " << self_dims << " other " << other_dims); + // TODO: Proper broadcast check + TRTORCH_CHECK(util::volume(self_dims) == util::volume(other_dims) || util::volume(self_dims) == util::volume(other_dims) / other_batch, "Found inputs to elementwise operation do not have the same number of elements or is not broadcastable:\n Found: self " << self_dims << " other " << other_dims); if (self_dims != other_dims) { LOG_DEBUG("Input shape dont match inserting shuffle layers to reshape to " << self_dims); - auto other_shuffle = ctx->net->addShuffle(*other); - other_shuffle->setReshapeDimensions(self_dims); - other_shuffle->setName(std::string("[Reshape other to " + util::toStr(self_dims) + ']').c_str()); - other = other_shuffle->getOutput(0); + auto self_shuffle = ctx->net->addShuffle(*self); + self_shuffle->setReshapeDimensions(util::toDimsPad(self_dims_vec, other_dims_vec.size())); + self_shuffle->setName(std::string("[Reshape self to " + util::toStr(self_dims) + " for broadcasting (" + name + ")]").c_str()); + self = self_shuffle->getOutput(0); } @@ -72,7 +76,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() auto self = args[0].ITensor(); auto other = args[1].ITensor(); auto scalar = args[2].unwrapToScalar().to(); - auto add = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, self, other, scalar); + auto add = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, self, other, util::node_info(n), scalar); TRTORCH_CHECK(add, "Unable to create add layer from node: " << *n); @@ -89,7 +93,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() auto self = args[0].ITensor(); auto other = args[1].ITensor(); auto scalar = args[2].unwrapToScalar().to(); - auto add = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, self, other, scalar); + auto add = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUM, self, other, util::node_info(n), scalar); TRTORCH_CHECK(add, "Unable to create add layer from node: " << *n); @@ -106,7 +110,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() auto self = args[0].ITensor(); auto other = args[1].ITensor(); auto scalar = args[2].unwrapToScalar().to(); - auto sub = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, self, other, scalar); + auto sub = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kSUB, self, other, util::node_info(n), scalar); TRTORCH_CHECK(sub, "Unable to create sub layer from node: " << *n); @@ -122,7 +126,7 @@ auto 
element_wise_registrations = RegisterNodeConversionPatterns() // Should implement self / other auto self = args[0].ITensor(); auto other = args[1].ITensor(); - auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other); + auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n)); TRTORCH_CHECK(div, "Unable to create div layer from node: " << *n); @@ -138,7 +142,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() // TODO: Remove with functionalization auto self = args[0].ITensor(); auto other = args[1].ITensor(); - auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other); + auto div = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kDIV, self, other, util::node_info(n)); TRTORCH_CHECK(div, "Unable to create div layer from node: " << *n); @@ -154,7 +158,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() // Should implement self * other auto self = args[0].ITensor(); auto other = args[1].ITensor(); - auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other); + auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TRTORCH_CHECK(mul, "Unable to create mul layer from node: " << *n); @@ -170,7 +174,7 @@ auto element_wise_registrations = RegisterNodeConversionPatterns() // TODO: Remove with functionalization auto self = args[0].ITensor(); auto other = args[1].ITensor(); - auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other); + auto mul = add_elementwise(ctx, nvinfer1::ElementWiseOperation::kPROD, self, other, util::node_info(n)); TRTORCH_CHECK(mul, "Unable to create mul layer from node: " << *n); diff --git a/tests/accuracy/accuracy_test.h b/tests/accuracy/accuracy_test.h index 229608de6d..28cfe9cdca 100644 --- a/tests/accuracy/accuracy_test.h +++ b/tests/accuracy/accuracy_test.h @@ -20,6 +20,7 @@ class AccuracyTests std::cerr << "error loading the model\n"; return; } + mod.eval(); } void TearDown() { diff --git a/tests/accuracy/test_fp16_accuracy.cpp b/tests/accuracy/test_fp16_accuracy.cpp index 7ebcc8b0fb..6de40a6c31 100644 --- a/tests/accuracy/test_fp16_accuracy.cpp +++ b/tests/accuracy/test_fp16_accuracy.cpp @@ -24,7 +24,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { jit_total += targets.sizes()[0]; jit_correct += torch::sum(torch::eq(predictions, targets)); } - torch::Tensor jit_accuracy = jit_correct / jit_total; + torch::Tensor jit_accuracy = (jit_correct / jit_total) * 100; std::vector> input_shape = {{32, 3, 32, 32}}; auto extra_info = trtorch::ExtraInfo({input_shape}); @@ -45,7 +45,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { trt_correct += torch::sum(torch::eq(predictions, targets)); } - torch::Tensor trt_accuracy = trt_correct / trt_total; + torch::Tensor trt_accuracy = (trt_correct / trt_total) * 100; ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_accuracy, trt_accuracy, 3)); } diff --git a/tests/accuracy/test_fp32_accuracy.cpp b/tests/accuracy/test_fp32_accuracy.cpp index b014340e82..d3d8bddb96 100644 --- a/tests/accuracy/test_fp32_accuracy.cpp +++ b/tests/accuracy/test_fp32_accuracy.cpp @@ -24,7 +24,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { jit_total += targets.sizes()[0]; jit_correct += torch::sum(torch::eq(predictions, targets)); } - torch::Tensor jit_accuracy = jit_correct / jit_total; + torch::Tensor jit_accuracy = (jit_correct / jit_total) * 100; std::vector> input_shape = {{32, 3, 32, 32}}; auto extra_info = 
trtorch::ExtraInfo({input_shape}); @@ -45,7 +45,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { trt_correct += torch::sum(torch::eq(predictions, targets)); } - torch::Tensor trt_accuracy = trt_correct / trt_total; + torch::Tensor trt_accuracy = (trt_correct / trt_total) * 100; ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_accuracy, trt_accuracy, 3)); } diff --git a/tests/accuracy/test_int8_accuracy.cpp b/tests/accuracy/test_int8_accuracy.cpp index 07d399c96d..aa4824948a 100644 --- a/tests/accuracy/test_int8_accuracy.cpp +++ b/tests/accuracy/test_int8_accuracy.cpp @@ -54,7 +54,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { jit_total += targets.sizes()[0]; jit_correct += torch::sum(torch::eq(predictions, targets)); } - torch::Tensor jit_accuracy = jit_correct / jit_total; + torch::Tensor jit_accuracy = (jit_correct / jit_total) * 100; // Compile Graph auto trt_mod = trtorch::CompileGraph(mod, extra_info); @@ -72,7 +72,7 @@ TEST_P(AccuracyTests, FP16AccuracyIsClose) { trt_total += targets.sizes()[0]; trt_correct += torch::sum(torch::eq(predictions, targets)).item().toFloat(); } - torch::Tensor trt_accuracy = trt_correct / trt_total; + torch::Tensor trt_accuracy = (trt_correct / trt_total) * 100; ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_accuracy, trt_accuracy, 3)); } From cd6b1b9f9e74ed7bccf0d5e1b32a2916b667099f Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 3 May 2020 20:30:23 -0700 Subject: [PATCH 10/13] docs: Clean up testing and documentation Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- BUILD | 2 +- README.md | 4 ++++ core/compiler.cpp | 2 +- core/lowering/lowering.cpp | 2 +- core/lowering/passes/BUILD | 10 +++++----- .../lowering/passes/exception_elimination.cpp | 1 - core/lowering/passes/fuse_flatten_linear.cpp | 3 +++ core/lowering/passes/passes.h | 4 ++-- core/lowering/passes/remove_dropout.cpp | 3 +++ core/lowering/passes/unpack_addmm.cpp | 3 +++ core/lowering/passes/unpack_batch_norm.cpp | 3 +++ ...log_softmax.cpp => unpack_log_softmax.cpp} | 5 ++++- core/util/logging/TRTorchLogger.cpp | 2 +- cpp/api/include/trtorch/logging.h | 19 ++++++++++--------- cpp/api/src/logging.cpp | 9 +++++++-- cpp/trtorchexec/main.cpp | 4 ++++ 16 files changed, 52 insertions(+), 24 deletions(-) rename core/lowering/passes/{expand_log_softmax.cpp => unpack_log_softmax.cpp} (93%) diff --git a/BUILD b/BUILD index 28eaf3862c..48f2ae5338 100644 --- a/BUILD +++ b/BUILD @@ -11,7 +11,7 @@ pkg_tar( "//core/conversion/evaluators:include", "//core/execution:include", "//core/lowering:include", - "//core/lowering/irfusers:include", + "//core/lowering/passes:include", "//core/util:include", "//core/util/logging:include" ], diff --git a/README.md b/README.md index 599c887daa..c06c02c5f4 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,10 @@ Thanks for wanting to contribute! There are two main ways to handle supporting a You can register a converter for your op using the `NodeConverterRegistry` inside your application. 
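+
+A registration following the same pattern the in-tree converters use might look
+roughly like this (exact namespaces and headers depend on how the core libraries
+are linked; the layer construction is illustrative):
+
+```c++
+auto my_relu_registrations = RegisterNodeConversionPatterns()
+    .pattern({
+        "aten::relu(Tensor self) -> (Tensor)",
+        [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+            // Grab the input as a TensorRT tensor, add a layer, then record the output
+            auto in = args[0].ITensor();
+            auto new_layer = ctx->net->addActivation(*in, nvinfer1::ActivationType::kRELU);
+            new_layer->setName(util::node_info(n).c_str());
+            ctx->AssociateValueAndTensor(n->outputs()[0], new_layer->getOutput(0));
+            return true;
+        }
+    });
+```
+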
+## Known Limitations + +- You cannot use both Adaptive Pooling in PyTorch and also use TRTorch Dynamic input shape + ## Structure of the repo | Component | Description | diff --git a/core/compiler.cpp b/core/compiler.cpp index b6efa3869d..f8d2861e8e 100644 --- a/core/compiler.cpp +++ b/core/compiler.cpp @@ -102,7 +102,7 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, ExtraInfo cfg) { // TODO: Should be doing a functional transform but need PR #31978 // [jit] More robust mangling - // torch::jit::script::Module new_mod = mod.clone(); + //torch::jit::script::Module new_mod = mod.clone(); torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt"); std::vector> graphs; for (const torch::jit::script::Method& method : mod.get_methods()) { diff --git a/core/lowering/lowering.cpp b/core/lowering/lowering.cpp index 22d613f1c6..1fbbb107e4 100644 --- a/core/lowering/lowering.cpp +++ b/core/lowering/lowering.cpp @@ -26,7 +26,7 @@ void LowerGraph(std::shared_ptr& g) { passes::RemoveDropout(g); passes::FuseFlattenLinear(g); passes::UnpackAddMM(g); - passes::ExpandLogSoftmax(g); + passes::UnpackLogSoftmax(g); //passes::RemoveDimExeception(g); //irfusers::UnpackBatchNorm(g); torch::jit::EliminateDeadCode(g); diff --git a/core/lowering/passes/BUILD b/core/lowering/passes/BUILD index 38235baea5..99e92c1829 100644 --- a/core/lowering/passes/BUILD +++ b/core/lowering/passes/BUILD @@ -6,12 +6,12 @@ cc_library( "passes.h", ], srcs = [ + "exception_elimination.cpp", "fuse_flatten_linear.cpp", - "expand_log_softmax.cpp", "remove_dropout.cpp", + "unpack_addmm.cpp", "unpack_batch_norm.cpp", - "exception_elimination.cpp", - "unpack_addmm.cpp" + "unpack_log_softmax.cpp", ], deps = [ "//core/util:prelude", @@ -23,7 +23,7 @@ load("@rules_pkg//:pkg.bzl", "pkg_tar") pkg_tar( name = "include", - package_dir = "core/lowering/irfusers/", - srcs = ["irfusers.h"], + package_dir = "core/lowering/passes/", + srcs = ["passes.h"], ) diff --git a/core/lowering/passes/exception_elimination.cpp b/core/lowering/passes/exception_elimination.cpp index 54f36a833a..1f9fb35a65 100644 --- a/core/lowering/passes/exception_elimination.cpp +++ b/core/lowering/passes/exception_elimination.cpp @@ -21,7 +21,6 @@ struct ExceptionOrPassPatternElimination { : graph_(std::move(graph)) {} void run() { - LOG_GRAPH("Pre exeception or pass elimination: " << *graph_); findExceptionOrPassNodes(graph_->block()); torch::jit::EliminateDeadCode(graph_); LOG_GRAPH("Post exeception or pass elimination: " << *graph_); diff --git a/core/lowering/passes/fuse_flatten_linear.cpp b/core/lowering/passes/fuse_flatten_linear.cpp index 3a00a1b109..4a8a7112d9 100644 --- a/core/lowering/passes/fuse_flatten_linear.cpp +++ b/core/lowering/passes/fuse_flatten_linear.cpp @@ -1,6 +1,8 @@ #include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/subgraph_rewrite.h" +#include "core/util/prelude.h" + namespace trtorch { namespace core { namespace lowering { @@ -38,6 +40,7 @@ void FuseFlattenLinear(std::shared_ptr& graph) { flatten_linear_bias_none_to_linear.RegisterRewritePattern( flatten_linear_bias_none_pattern, fused_linear_bias_none); flatten_linear_bias_none_to_linear.runOnGraph(graph); + LOG_GRAPH("Post flatten linear: " << *graph); } } // namespace passes diff --git a/core/lowering/passes/passes.h b/core/lowering/passes/passes.h index 1879149f62..d4efc198db 100644 --- a/core/lowering/passes/passes.h +++ b/core/lowering/passes/passes.h @@ -8,10 +8,10 @@ namespace lowering { namespace passes { void 
FuseFlattenLinear(std::shared_ptr& graph); -void ExpandLogSoftmax(std::shared_ptr& graph); void RemoveDropout(std::shared_ptr& graph); -void UnpackBatchNorm(std::shared_ptr& graph); void UnpackAddMM(std::shared_ptr& graph); +void UnpackBatchNorm(std::shared_ptr& graph); +void UnpackLogSoftmax(std::shared_ptr& graph); void EliminateExceptionOrPassPattern(std::shared_ptr graph); } // namespace irfusers diff --git a/core/lowering/passes/remove_dropout.cpp b/core/lowering/passes/remove_dropout.cpp index b28fe011a8..ccac4c3d8a 100644 --- a/core/lowering/passes/remove_dropout.cpp +++ b/core/lowering/passes/remove_dropout.cpp @@ -1,6 +1,8 @@ #include #include +#include "core/util/prelude.h" + namespace trtorch { namespace core { namespace lowering { @@ -20,6 +22,7 @@ void RemoveDropout(std::shared_ptr& graph) { remove_dropout.RegisterRewritePattern( dropout_pattern, no_dropout_pattern); remove_dropout.runOnGraph(graph); + LOG_GRAPH("Post remove dropout: " << *graph); } } // namespace passes diff --git a/core/lowering/passes/unpack_addmm.cpp b/core/lowering/passes/unpack_addmm.cpp index b15a752c2f..ce2cc046e8 100644 --- a/core/lowering/passes/unpack_addmm.cpp +++ b/core/lowering/passes/unpack_addmm.cpp @@ -1,6 +1,8 @@ #include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/subgraph_rewrite.h" +#include "core/util/prelude.h" + namespace trtorch { namespace core { namespace lowering { @@ -23,6 +25,7 @@ void UnpackAddMM(std::shared_ptr& graph) { torch::jit::SubgraphRewriter unpack_addmm; unpack_addmm.RegisterRewritePattern(addmm_pattern, mm_add_pattern); unpack_addmm.runOnGraph(graph); + LOG_GRAPH("Post unpack addmm: " << *graph); } diff --git a/core/lowering/passes/unpack_batch_norm.cpp b/core/lowering/passes/unpack_batch_norm.cpp index bcfa5930cb..19a9ad29bd 100644 --- a/core/lowering/passes/unpack_batch_norm.cpp +++ b/core/lowering/passes/unpack_batch_norm.cpp @@ -1,5 +1,7 @@ #include "torch/csrc/jit/passes/subgraph_rewrite.h" +#include "core/util/prelude.h" + namespace trtorch { namespace core { namespace lowering { @@ -39,6 +41,7 @@ void UnpackBatchNorm(std::shared_ptr& graph) { torch::jit::SubgraphRewriter unpack_batch_norm; unpack_batch_norm.RegisterRewritePattern(batch_norm_pattern, expanded_batch_norm_pattern); unpack_batch_norm.runOnGraph(graph); + LOG_GRAPH("Post unpack batchnorm: " << *graph); } } // Namespace passes } // namespace lowering diff --git a/core/lowering/passes/expand_log_softmax.cpp b/core/lowering/passes/unpack_log_softmax.cpp similarity index 93% rename from core/lowering/passes/expand_log_softmax.cpp rename to core/lowering/passes/unpack_log_softmax.cpp index 37bbcdd321..7a53e64e9e 100644 --- a/core/lowering/passes/expand_log_softmax.cpp +++ b/core/lowering/passes/unpack_log_softmax.cpp @@ -1,12 +1,14 @@ #include "torch/csrc/jit/passes/fuse_linear.h" #include "torch/csrc/jit/passes/subgraph_rewrite.h" +#include "core/util/prelude.h" + namespace trtorch { namespace core { namespace lowering { namespace passes { -void ExpandLogSoftmax(std::shared_ptr& graph) { +void UnpackLogSoftmax(std::shared_ptr& graph) { // Its easier for TensorRT if we seperate softmax and log // There might need to be a reshape inserted see: // https://github.com/onnx/onnx-tensorrt/blob/5dca8737851118f6ab8a33ea1f7bcb7c9f06caf5/builtin_op_importers.cpp#L1593 @@ -43,6 +45,7 @@ void ExpandLogSoftmax(std::shared_ptr& graph) { logsoftmax_none_to_softmax_log_none.RegisterRewritePattern( logsoftmax_none_pattern, softmax_log_none_pattern); 
logsoftmax_none_to_softmax_log_none.runOnGraph(graph); + LOG_GRAPH("Post unpack logsoftmax: " << *graph); } } // namespace passes diff --git a/core/util/logging/TRTorchLogger.cpp b/core/util/logging/TRTorchLogger.cpp index 7c80152076..678506c09f 100644 --- a/core/util/logging/TRTorchLogger.cpp +++ b/core/util/logging/TRTorchLogger.cpp @@ -101,7 +101,7 @@ namespace { TRTorchLogger& get_global_logger() { #ifndef NDEBUG static TRTorchLogger global_logger("[TRTorch - Debug Build] - ", - LogLevel::kGRAPH, + LogLevel::kDEBUG, true); #else static TRTorchLogger global_logger("[TRTorch] - ", diff --git a/cpp/api/include/trtorch/logging.h b/cpp/api/include/trtorch/logging.h index 8841c0d47c..d841be33c1 100644 --- a/cpp/api/include/trtorch/logging.h +++ b/cpp/api/include/trtorch/logging.h @@ -9,11 +9,12 @@ namespace logging { * Emum for setting message severity */ enum Level { - kINTERNAL_ERROR, - kERROR, - kWARNING, - kINFO, - kDEBUG, + kINTERNAL_ERROR, // Only print messages for internal errors + kERROR, // Print all internal errors and errors (default) + kWARNING, // Print warnings and errors + kINFO, // Print all info, warnings and errors + kDEBUG, // Print all debug info, info, warnings and errors + kGRAPH, // Print everything including the intermediate graphs of the lowering phase }; // Are these ones necessary for the user? @@ -35,7 +36,7 @@ TRTORCH_API void set_reportable_log_level(Level lvl); TRTORCH_API void set_is_colored_output_on(bool colored_output_on); /** - * @brief Get the current reportable log level + * @brief Get the current reportable log level */ TRTORCH_API Level get_reportable_log_level(); @@ -45,10 +46,10 @@ TRTORCH_API Level get_reportable_log_level(); TRTORCH_API bool get_is_colored_output_on(); /** - * @brief Adds a message to the global log + * @brief Adds a message to the global log * - * @param lvl: trtorch::logging::Level - Severity of the message - * @param msg: std::string - Message to be logged + * @param lvl: trtorch::logging::Level - Severity of the message + * @param msg: std::string - Message to be logged */ // Dont know if we want this? 
TRTORCH_API void log(Level lvl, std::string msg); diff --git a/cpp/api/src/logging.cpp b/cpp/api/src/logging.cpp index 25fbfd0bfe..2a82c19ce7 100644 --- a/cpp/api/src/logging.cpp +++ b/cpp/api/src/logging.cpp @@ -7,7 +7,7 @@ namespace logging { std::string get_logging_prefix() { return core::util::logging::get_logger().get_logging_prefix(); } - + void set_logging_prefix(std::string prefix) { core::util::logging::get_logger().set_logging_prefix(prefix); } @@ -27,6 +27,9 @@ void set_reportable_log_level(Level lvl) { case Level::kINFO: log_lvl = core::util::logging::LogLevel::kINFO; break; + case Level::kGRAPH: + log_lvl = core::util::logging::LogLevel::kGRAPH; + break; case Level::kDEBUG: default: log_lvl = core::util::logging::LogLevel::kDEBUG; @@ -50,12 +53,14 @@ Level get_reportable_log_level() { return Level::kWARNING; case core::util::logging::LogLevel::kINFO: return Level::kINFO; + case core::util::logging::LogLevel::kGRAPH: + return Level::kGRAPH; case core::util::logging::LogLevel::kDEBUG: default: return Level::kDEBUG; } } - + bool get_is_colored_output_on() { return core::util::logging::get_logger().get_is_colored_output_on(); } diff --git a/cpp/trtorchexec/main.cpp b/cpp/trtorchexec/main.cpp index f3909fef27..e6f18233e9 100644 --- a/cpp/trtorchexec/main.cpp +++ b/cpp/trtorchexec/main.cpp @@ -55,11 +55,13 @@ int main(int argc, const char* argv[]) { dims.push_back(v); } + std::cout << "Checking operator support" << std::endl; if (!trtorch::CheckMethodOperatorSupport(mod, "forward")) { std::cerr << "Method is not currently supported by TRTorch" << std::endl; return -1; } + std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl; auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", dims); std::ofstream out("/tmp/engine_converted_from_jit.trt"); out << engine; @@ -75,7 +77,9 @@ int main(int argc, const char* argv[]) { std::vector jit_results; jit_results.push_back(jit_results_ivalues.toTensor()); + std::cout << "Compiling graph as module" << std::endl; auto trt_mod = trtorch::CompileGraph(mod, dims); + std::cout << "Running TRT module" << std::endl; torch::jit::IValue trt_results_ivalues = trt_mod.forward(trt_inputs_ivalues); std::vector trt_results; trt_results.push_back(trt_results_ivalues.toTensor()); From 78a1c614130030e0ca8bf477dfd8c35bccdebe25 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 3 May 2020 20:42:25 -0700 Subject: [PATCH 11/13] feat(//core/conversion/conversionctx): Make op precision available at conversion time through ctx Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/conversion.cpp | 4 ++++ core/conversion/conversionctx/ConversionCtx.cpp | 1 + core/conversion/conversionctx/ConversionCtx.h | 1 + 3 files changed, 6 insertions(+) diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index 248ad52b98..c602e48b89 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -160,6 +160,10 @@ void AddInputs(ConversionCtx* ctx, TRTORCH_CHECK(profile->isValid(), "Optimization profile is invalid, please check the input range provided (conversion.AddInputs)"); ctx->cfg->addOptimizationProfile(profile); + // TODO: Enable in TRT 7.1 + // if (ctx->op_precision == nvinfer1::DataType::kINT8) { + // ctx->cfg->setCalibrationProfile(profile); + // } } void MarkOutputs(ConversionCtx* ctx, at::ArrayRef outputs) { diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index 4afeb8238a..acde8024c3 
100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -60,6 +60,7 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings) input_type = nvinfer1::DataType::kFLOAT; break; } + op_precision = settings.op_precision; if (settings.refit) { cfg->setFlag(nvinfer1::BuilderFlag::kREFIT); diff --git a/core/conversion/conversionctx/ConversionCtx.h b/core/conversion/conversionctx/ConversionCtx.h index 81bf99ca7c..1d2581fdc9 100644 --- a/core/conversion/conversionctx/ConversionCtx.h +++ b/core/conversion/conversionctx/ConversionCtx.h @@ -47,6 +47,7 @@ struct ConversionCtx { nvinfer1::INetworkDefinition* net; nvinfer1::IBuilderConfig* cfg; nvinfer1::DataType input_type; + nvinfer1::DataType op_precision; BuilderSettings settings; util::logging::TRTorchLogger logger; // Pointers to data that needs to remain alive until conversion is done From 2d65ecef8edeb3b6a854ec8c803c5dae17e82815 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 3 May 2020 20:54:48 -0700 Subject: [PATCH 12/13] fix(//core/conversion/conversionctx): Check both tensor and eval maps before throwing conversion warning Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/conversionctx/ConversionCtx.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/core/conversion/conversionctx/ConversionCtx.cpp b/core/conversion/conversionctx/ConversionCtx.cpp index acde8024c3..7037e9a512 100644 --- a/core/conversion/conversionctx/ConversionCtx.cpp +++ b/core/conversion/conversionctx/ConversionCtx.cpp @@ -118,10 +118,13 @@ std::string ConversionCtx::SerializeEngine() { bool ConversionCtx::CheckLayerAddition(const torch::jit::Node* n) { for (auto out : n->outputs()) { - auto iter = this->value_tensor_map.find(out); - if (iter == this->value_tensor_map.end()) { - LOG_WARNING("Node " << util::node_info(n) << " output: " << out->debugName() << " does not have a coresponding output, may potentially indicate a defective converter"); - return false; + auto iter_t = this->value_tensor_map.find(out); + if (iter_t == this->value_tensor_map.end()) { + auto iter_iv = this->evaluated_value_map.find(out); + if (iter_iv == this->evaluated_value_map.end()) { + LOG_WARNING("Node " << util::node_info(n) << " output: " << out->debugName() << " does not have a coresponding value or tensor, may potentially indicate a defective evaluator or converter"); + return false; + } } } return true; From 9d0bdafff2361238abb2d49ff71c757e93123462 Mon Sep 17 00:00:00 2001 From: Naren Dasan Date: Sun, 3 May 2020 20:58:54 -0700 Subject: [PATCH 13/13] refactor(//core/conversion): Run output checking on both converters and evaluators Signed-off-by: Naren Dasan Signed-off-by: Naren Dasan --- core/conversion/conversion.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/conversion/conversion.cpp b/core/conversion/conversion.cpp index c602e48b89..2d6b704f40 100644 --- a/core/conversion/conversion.cpp +++ b/core/conversion/conversion.cpp @@ -214,9 +214,7 @@ void ConvertBlockToNetDef(ConversionCtx* ctx, const torch::jit::Block* b, Conver } for (const auto n : nodes) { - if (converters::node_is_convertable(n)) { - ctx->CheckLayerAddition(n); - } + ctx->CheckLayerAddition(n); } auto outputs = b->outputs();
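With the layer-addition check now running for every node (converters and
evaluators alike), a convenient way to see what each lowering pass and converter
actually produced is to raise the log level to the new kGRAPH setting before
compiling. A minimal sketch from the application side, assuming the public
headers follow the tree layout above:

    #include "torch/script.h"
    #include "trtorch/trtorch.h"  // assumed top-level API header
    #include "trtorch/logging.h"

    // Compile with graph-level logging so each lowering pass dumps the graph it
    // produced and conversion warnings are easier to correlate with the IR.
    torch::jit::script::Module compile_with_graph_dumps(torch::jit::script::Module mod,
                                                        trtorch::ExtraInfo info) {
      trtorch::logging::set_reportable_log_level(trtorch::logging::Level::kGRAPH);
      return trtorch::CompileGraph(mod, info);
    }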