
Commit e8c545a: Add external calling functionality
Author: Bram Wasti
Parent: aacebaf

22 files changed: +763 -238 lines
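
In brief: this change lets the TensorExpr fuser hand selected ops off to external ("native") implementations. Ops whose qualified name appears in a native function registry become eligible for fusion, a new CallExternal IR node represents such a call in a tensor's body, and Function::ElementStmt lowers it to an OpaqueCall rather than an element-wise Store. aten::matmul is the first op wired through this path, covered by a new test. (Only 6 of the 22 changed files are shown below.)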

caffe2/CMakeLists.txt (+1)

@@ -466,6 +466,7 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/kernel.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_codegen.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/llvm_jit.cpp
+    ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/native.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/types.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/ir_printer.cpp
     ${TORCH_SRC_DIR}/csrc/jit/tensorexpr/ir_mutator.cpp

test/test_tensorexpr.py (+19)

@@ -250,6 +250,25 @@ def np_easy(x, y, z):
         npr = np_easy(a.numpy(), b.numpy(), c.numpy())
         np.testing.assert_allclose(npr, x.numpy())

+    def test_matmul(self):
+        llvm = LLVMCodeGenExecuted()
+        def easy(x, y):
+            aaa, bbb = torch.chunk(y, 2)
+            y = torch.cat([aaa, bbb], dim=0)
+            aaa = torch.matmul(x, y) * 3
+            return aaa
+
+        shape = (128, 128)
+        a = torch.rand(shape)
+        b = torch.rand(shape)
+        traced = torch.jit.trace(
+            easy, (a, b)
+        )
+
+        x = traced(a, b)
+        y = 3 * (a @ b)
+        np.testing.assert_allclose(y.numpy(), x.numpy(), rtol=1e-5, atol=1e-3)
+        assert llvm.elapsed_value() == 1

     def test_broadcast(self):
         def easy(x, y, z):
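
The closing assertion, llvm.elapsed_value() == 1, verifies that the LLVM backend ran exactly once for the traced function, i.e. the chunk/cat/matmul/multiply sequence was compiled into a single fused TensorExpr kernel rather than falling back to the interpreter.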

torch/csrc/jit/passes/guard_elimination.cpp (+2 -1)

@@ -1,8 +1,8 @@
-#include <torch/csrc/jit/passes/guard_elimination.h>
 #include <torch/csrc/jit/graph_executor.h>
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/alias_analysis.h>
 #include <torch/csrc/jit/passes/constant_propagation.h>
+#include <torch/csrc/jit/passes/guard_elimination.h>
 #include <torch/csrc/jit/passes/peephole.h>
 #include <memory>
 #include <unordered_set>

@@ -243,6 +243,7 @@ struct GuardElimination {
     case aten::rsqrt:
     case aten::remainder:
     case aten::mm:
+    case aten::matmul:
     case aten::min:
     case aten::max:
     case aten::type_as:

torch/csrc/jit/passes/tensorexpr_fuser.cpp (+12 -9)

@@ -8,6 +8,7 @@
 #include <torch/csrc/jit/passes/dead_code_elimination.h>
 #include <torch/csrc/jit/passes/utils/subgraph_utils.h>
 #include <torch/csrc/jit/tensorexpr/kernel.h>
+#include <torch/csrc/jit/tensorexpr/native.h>

 using namespace torch::jit;
 using namespace torch::jit::tensorexpr;

@@ -119,7 +120,12 @@ bool isSupported(Node* node) {
     case aten::__rshift__:
     case aten::where:
       return true;
-    default:
+    default: {
+      auto& nfr = getNativeFunctionRegistry();
+      if (nfr.count(node->kind().toQualString())) {
+        return true;
+      }
+    }
       return false;
   }
 }

@@ -140,10 +146,7 @@ bool canHandle(Node* node, AliasDb& aliasDb) {
     return false; \
   }

-bool canMerge(
-    Node* consumer,
-    Node* producer,
-    AliasDb& aliasDb) {
+bool canMerge(Node* consumer, Node* producer, AliasDb& aliasDb) {
   // Only handle complete tensor types
   for (torch::jit::Value* output : consumer->outputs()) {
     REQ(output->isCompleteTensor());

@@ -162,8 +165,7 @@ bool canMerge(
   REQ(aliasDb.couldMoveAfterTopologically(consumer, producer));

   // Ops that return aliases can only be folded if this is the only use.
-  if (producer->kind() == aten::slice ||
-      producer->kind() == aten::unsqueeze ||
+  if (producer->kind() == aten::slice || producer->kind() == aten::unsqueeze ||
       producer->kind() == prim::ConstantChunk) {
     for (auto& use : producer->output(0)->uses()) {
       REQ(use.user == consumer);

@@ -196,11 +198,12 @@ bool canMerge(
 }
 #undef REQ

-Node *getOrCreateTensorExprSubgraph(Node *n) {
+Node* getOrCreateTensorExprSubgraph(Node* n) {
   if (n->hasAttribute(attr::Subgraph) && n->kind() == getTensorExprSymbol()) {
     return n;
   }
-  auto te_group = SubgraphUtils::createSingletonSubgraph(n, getTensorExprSymbol());
+  auto te_group =
+      SubgraphUtils::createSingletonSubgraph(n, getTensorExprSymbol());
   GRAPH_UPDATE("getOrCreateTensorExprSubgraph: ", *te_group);
   return te_group;
 }
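
The default case above now consults the native function registry: an op whose qualified name (node->kind().toQualString()) is registered becomes fusable, while anything else still falls through to return false. native.h itself is not part of this excerpt, so the registry's exact shape is unknown; a minimal sketch of what it might look like, with an assumed call convention (only the lookup by name is confirmed by the code above):

// Minimal sketch, not the actual native.h from this commit. The call
// convention (output buffer, input buffers, scalar args) is an assumption;
// only the name-keyed lookup is confirmed by the fuser code above.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

using NativeFunction = void (*)(
    void* out,
    const std::vector<void*>& inputs,
    const std::vector<int64_t>& scalar_args);

std::unordered_map<std::string, NativeFunction>& getNativeFunctionRegistry() {
  // Meyers singleton so registration can run from static initializers.
  static std::unordered_map<std::string, NativeFunction> registry;
  return registry;
}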

torch/csrc/jit/tensorexpr/function.cpp (+30 -7)

@@ -45,8 +45,8 @@ Tensor* Compute(
   std::vector<const Var*> args;
   unpack_dim_args(dim_args, &dims, &args);
   const Expr* body = body_func(VarHandle(args[0])).node();
-  Function* func =
-      new Function(func_name, std::move(dims), std::move(args), std::move(body));
+  Function* func = new Function(
+      func_name, std::move(dims), std::move(args), std::move(body));
   return new Tensor(func, 0);
 }

@@ -67,12 +67,16 @@ Tensor* Compute(
 Tensor* Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
-    std::function<ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)> body_func) {
+    std::function<
+        ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&)>
+        body_func) {
   CHECK_EQ(dim_args.size(), 3ULL);
   std::vector<const Expr*> dims;
   std::vector<const Var*> args;
   unpack_dim_args(dim_args, &dims, &args);
-  const Expr* body = body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2])).node();
+  const Expr* body =
+      body_func(VarHandle(args[0]), VarHandle(args[1]), VarHandle(args[2]))
+          .node();
   Function* func = new Function(
       func_name, std::move(dims), std::move(args), std::move(body));
   return new Tensor(func, 0);

@@ -81,8 +85,11 @@ Tensor* Compute(
 Tensor* Compute(
     const std::string& func_name,
     const std::vector<DimArg>& dim_args,
-    std::function<ExprHandle(const VarHandle&, const VarHandle&, const VarHandle&, const VarHandle&)>
-        body_func) {
+    std::function<ExprHandle(
+        const VarHandle&,
+        const VarHandle&,
+        const VarHandle&,
+        const VarHandle&)> body_func) {
   CHECK_EQ(dim_args.size(), 4ULL);
   std::vector<const Expr*> dims;
   std::vector<const Var*> args_nodes;

@@ -96,6 +103,21 @@ Tensor* Compute(

 Stmt* Function::ElementStmt(size_t index) {
   std::vector<ExprHandle> strides(dims_.size());
+  auto* ce = dynamic_cast<const CallExternal*>(body(index));
+  if (ce != nullptr) {
+    std::vector<const Var*> input_vars;
+    std::vector<const Expr*> input_args;
+    for (auto p : ce->params()) {
+      auto fc = dynamic_cast<const FunctionCall*>(p);
+      if (fc) {
+        input_vars.emplace_back(fc->tensor()->function()->func_var(index));
+      } else {
+        input_args.emplace_back(p);
+      }
+    }
+    return OpaqueCall::make(
+        ce->name(), func_var(index), input_vars, input_args);
+  }
   for (size_t i = 0; i < strides.size(); i++) {
     if (i == strides.size() - 1) {
       strides[i] = ExprHandle(1);

@@ -120,7 +142,8 @@ Stmt* Function::ElementStmt(size_t index) {

   const Expr* mask = new IntImm(1);

-  Stmt* update_stmt = new Store(func_var(index), total_index.node(), body(index), mask);
+  Stmt* update_stmt =
+      new Store(func_var(index), total_index.node(), body(index), mask);
   return update_stmt;
 }
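
The new branch in Function::ElementStmt is the core of the change: when a tensor's body is a CallExternal, the usual strided Store is skipped and a single OpaqueCall is emitted, whose FunctionCall parameters become input buffers (input_vars) while every other expression is passed through as a scalar argument (input_args). A hypothetical sketch of the runtime side of such a call, reusing the assumed registry and call convention from the sketch above:

// Hypothetical runtime handler for an OpaqueCall: resolve the callee by
// name and invoke it on raw buffers. Only the name-keyed lookup is
// confirmed by the diff; the signature below is an assumption.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

void dispatchOpaqueCall(
    const std::string& name,
    void* out,
    const std::vector<void*>& inputs,
    const std::vector<int64_t>& scalar_args) {
  auto& registry = getNativeFunctionRegistry(); // registry sketch above
  auto it = registry.find(name);
  if (it == registry.end()) {
    throw std::runtime_error("no native function registered for " + name);
  }
  it->second(out, inputs, scalar_args);
}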

torch/csrc/jit/tensorexpr/ir.cpp (+16 -5)

@@ -59,6 +59,14 @@ Dtype Intrinsics::IntrinsicsDtype(
   return params[0]->dtype();
 }

+Dtype CallExternal::CallExternalDtype(
+    std::string name,
+    const std::vector<const Expr*>& params) {
+  // TODO: check the op_type and make a real decision
+  CHECK_GE(params.size(), 1ULL);
+  return params[0]->dtype();
+}
+
 int Intrinsics::OpArgCount(IntrinsicsOp op_type) {
   switch (op_type) {
     case kSin:

@@ -100,39 +108,42 @@ int Intrinsics::OpArgCount(IntrinsicsOp op_type) {
   }
 }

-std::vector<const Expr*> ExprHandleVectorToExprVector(const std::vector<ExprHandle>& v) {
+std::vector<const Expr*> ExprHandleVectorToExprVector(
+    const std::vector<ExprHandle>& v) {
   std::vector<const Expr*> result(v.size());
   for (size_t i = 0; i < v.size(); i++) {
     result[i] = v[i].node();
   }
   return result;
 }

-std::vector<ExprHandle> ExprVectorToExprHandleVector(const std::vector<const Expr*>& v) {
+std::vector<ExprHandle> ExprVectorToExprHandleVector(
+    const std::vector<const Expr*>& v) {
   std::vector<ExprHandle> result(v.size());
   for (size_t i = 0; i < v.size(); i++) {
     result[i] = ExprHandle(v[i]);
   }
   return result;
 }

-std::vector<const Var*> VarHandleVectorToVarVector(const std::vector<VarHandle>& v) {
+std::vector<const Var*> VarHandleVectorToVarVector(
+    const std::vector<VarHandle>& v) {
   std::vector<const Var*> result(v.size());
   for (size_t i = 0; i < v.size(); i++) {
     result[i] = v[i].node();
   }
   return result;
 }

-std::vector<VarHandle> VarVectorToVarHandleVector(const std::vector<const Var*>& v) {
+std::vector<VarHandle> VarVectorToVarHandleVector(
+    const std::vector<const Var*>& v) {
   std::vector<VarHandle> result(v.size());
   for (size_t i = 0; i < v.size(); i++) {
     result[i] = VarHandle(v[i]);
   }
   return result;
 }

-
 } // namespace tensorexpr
 } // namespace jit
 } // namespace torch
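
CallExternalDtype currently just copies the dtype of the first parameter, as the TODO admits. The new native.cpp (added to the build in the CMake change above but not shown here) is presumably where implementations get registered; a hypothetical registration, again reusing the assumed registry sketch (only the "aten::matmul" key is corroborated, by the fuser lookup and the matmul test):

// Hypothetical registration of a native matmul. The key matches
// node->kind().toQualString() for aten::matmul; the body and call
// convention are assumptions.
static const bool matmul_registered = [] {
  getNativeFunctionRegistry()["aten::matmul"] =
      [](void* out,
         const std::vector<void*>& inputs,
         const std::vector<int64_t>& scalar_args) {
        // ... invoke an ATen/BLAS matmul over the raw buffers here ...
      };
  return true;
}();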
