Commit ac6492f

Merge OpenAI Triton commit a85fab0 (#3755)
This PR changes the Triton base from e196446 to a85fab0 (Mar 24). Pass rate: 89.76%.
2 parents 77b8626 + 8203b1d commit ac6492f


47 files changed (+1802, -911 lines)

README.md (+4)

@@ -138,6 +138,10 @@ arbitrary LLVM version.
   during the build. By default, this is the user's home directory. It
   can be changed anytime.

+- If you're running out of memory when building Triton, specify the `MAX_JOBS`
+  environment variable (to the `pip install -e python` command) to limit the
+  number of jobs.
+
 - Pass `--no-build-isolation` to `pip install` to make nop builds faster.
   Without this, every invocation of `pip install` uses a different symlink to
   cmake, and this forces ninja to rebuild most of the `.a` files.
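
The new bullet documents an escape hatch for out-of-memory builds: `MAX_JOBS` caps the parallel compile jobs that the `pip install -e python` build spawns. A hypothetical invocation on a POSIX shell would be `MAX_JOBS=8 pip install -e python`.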

bin/RegisterTritonDialects.h (+1)

@@ -97,6 +97,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
   mlir::registerTritonAMDGPUInThreadTranspose();
+  mlir::registerTritonAMDGPUCoalesceAsyncCopy();
   mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
   mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();

cmake/llvm-hash.txt (+1, -1)

@@ -1 +1 @@
-2619c2ed584cdf3b38e6743ed3c785223f06e3f7
+0ea4fb92648b2aa7cbab486bb493e122b4dcc062

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td (+1, -1)

@@ -430,7 +430,7 @@ def NVMMASharedEncodingAttr :
     } else if (contigDimSizeInByte >= 32 && contigDimSizeInByte % 32 == 0) {
       swizzlingByteWidth = 32;
     } else {
-      llvm_unreachable("unsupported shared memory layout for MMAv3");
+      llvm_unreachable("unsupported NVMMA layout (MMAv3 or TMA)");
     }
     bool transposed = order[0] == 0;
     return $_get(context, swizzlingByteWidth, transposed, eleBitWidth, fp4Padded, CTALayout);
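
For orientation, the surrounding cascade picks the widest swizzling width that evenly divides the contiguous-dim byte size; only the 32-byte branch and the fallthrough are visible in this hunk, so the wider branches in the sketch below are assumptions:

```cpp
#include <cstdio>
#include <cstdlib>

// Sketch of the swizzling-byte-width selection around the hunk above.
static int swizzlingByteWidthFor(int contigDimSizeInByte) {
  if (contigDimSizeInByte >= 128 && contigDimSizeInByte % 128 == 0)
    return 128; // assumed branch, by analogy with the visible 32B case
  if (contigDimSizeInByte >= 64 && contigDimSizeInByte % 64 == 0)
    return 64;  // assumed branch
  if (contigDimSizeInByte >= 32 && contigDimSizeInByte % 32 == 0)
    return 32;  // the branch shown in the hunk
  std::fprintf(stderr, "unsupported NVMMA layout (MMAv3 or TMA)\n");
  std::abort(); // stand-in for llvm_unreachable
}
```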

include/triton/Dialect/TritonGPU/Transforms/PipelineExpander.h (+2, -2)

@@ -25,7 +25,7 @@ namespace triton {

 /// Options to dictate how loops should be pipelined.
 struct PipeliningOption {
-  /// Lambda returning all the operation in the forOp, with their stage, in the
+  /// Lambda returning all the operations in the forOp, with their stage, in the
   /// order picked for the pipelined loop.
   using GetScheduleFnType = std::function<void(
       scf::ForOp, std::vector<std::pair<Operation *, unsigned>> &)>;
@@ -54,7 +54,7 @@ struct PipeliningOption {
   /// Control whether the transformation checks that the number of iterations is
   /// greater or equal to the number of stages and skip the transformation if
   /// this is not the case. If the loop is dynamic and this is set to true the
-  /// pipeliner will have to predicate operations in the the prologue/epilogue.
+  /// pipeliner will have to predicate operations in the prologue/epilogue.
   bool supportDynamicLoops = false;

   // Callback to predicate operations when the prologue or epilogue are not
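
The `GetScheduleFnType` alias above is the whole contract between a scheduler and the pipeline expander. A minimal sketch of a callback, assuming the option member is named `getScheduleFn` as in the upstream MLIR pipeliner this file derives from:

```cpp
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h"

// Hypothetical two-stage schedule: tt.load ops are assigned stage 0 so the
// expander hoists them toward the prologue; everything else runs at stage 1.
static void configurePipeliner(mlir::triton::PipeliningOption &options) {
  options.getScheduleFn =
      [](mlir::scf::ForOp forOp,
         std::vector<std::pair<mlir::Operation *, unsigned>> &schedule) {
        for (mlir::Operation &op : forOp.getBody()->without_terminator()) {
          unsigned stage = mlir::isa<mlir::triton::LoadOp>(op) ? 0 : 1;
          schedule.emplace_back(&op, stage);
        }
      };
}
```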

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h (+8, -3)

@@ -156,11 +156,16 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
           "elem type .b4x16_p64 supports only 128B swizzling");
     }
   } else {
-    op->emitError() << "Unhandled encoding type";
-    return failure();
+    auto swizzledEnc = dyn_cast<gpu::SwizzledSharedEncodingAttr>(
+        op.getType().getBlockType().getEncoding());
+    if (!swizzledEnc || swizzledEnc.getVec() != 1 ||
+        swizzledEnc.getPerPhase() != 1 || swizzledEnc.getMaxPhase() != 1) {
+      op->emitError() << "Unhandled encoding type";
+      return failure();
+    }
   }

-  int32_t swizzle_mode;
+  int32_t swizzle_mode = 0;
   if (swizzleBytes == 128) {
     swizzle_mode = 3;
   } else if (swizzleBytes == 64) {
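
The hunk is truncated after the 64-byte case. For orientation, a standalone sketch of the full `swizzleBytes` to `swizzle_mode` mapping; the values match CUDA's `CUtensorMapSwizzle` enum (NONE=0, 32B=1, 64B=2, 128B=3), and every branch other than the visible 128-byte one is an assumption:

```cpp
#include <cstdint>

// Sketch: map a swizzle byte width to the TMA descriptor's swizzle mode.
static int32_t swizzleModeFor(int swizzleBytes) {
  int32_t swizzle_mode = 0; // no swizzling (the new default above)
  if (swizzleBytes == 128)
    swizzle_mode = 3;       // shown in the hunk
  else if (swizzleBytes == 64)
    swizzle_mode = 2;       // assumed
  else if (swizzleBytes == 32)
    swizzle_mode = 1;       // assumed
  return swizzle_mode;
}
```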

include/triton/Tools/Sys/GetEnv.hpp (+1)

@@ -33,6 +33,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_GLOBAL_PREFETCH",
     "TRITON_HIP_LOCAL_PREFETCH",
+    "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
     "TRITON_LLVM_DEBUG_ONLY",

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp (+45, -7)

@@ -392,8 +392,10 @@ struct MemDescSubviewOpConversion
     Location loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto srcTy = op.getSrc().getType();
+    auto destTy = op.getResult().getType();
     auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
     auto layoutOrder = getOrder(srcTy);
+    auto enc = srcTy.getEncoding();

     // newBase = base + offset
     auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),
@@ -408,13 +410,49 @@ struct MemDescSubviewOpConversion
     for (int i = rankReduced; i < opOffsetVals.size(); i++) {
       offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
     }
-    // Compute the offset based on the original strides of the shared memory
-    // object
-    auto offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
-    auto elemPtrTy = smemObj.getBase().getType();
-    smemObj = SharedMemoryObject(
-        b.gep(elemPtrTy, llvmElemTy, smemObj.getBase(), offset), llvmElemTy,
-        offsetVals);
+    Value offset = b.undef(i32_ty);
+    auto allocShape = srcTy.getAllocShape();
+    bool isSimpleSubview =
+        allocShape.take_back(destRank) == destTy.getShape() ||
+        !isa<NVMMASharedEncodingAttr>(enc);
+    if (!isSimpleSubview) {
+      auto nvmmaEnc = cast<NVMMASharedEncodingAttr>(enc);
+      assert(destRank >= 2 &&
+             "Shape size should be >= 2 when using NVMMAShared encoding");
+      auto swizzleStride = b.i32_val((nvmmaEnc.getSwizzlingByteWidth() * 8) /
+                                     llvmElemTy.getIntOrFloatBitWidth());
+      offset = b.i32_val(0);
+      for (auto i = 0; i < opOffsetVals.size() - 2; ++i) {
+        offset = b.add(offset, b.mul(opOffsetVals[i], opSmemStrides[i]));
+      }
+      // newOffset = offset - (stridedOff * swizzledStride + contigOff /
+      // swizzledStride * tileSize + contigOff % swizzledStride)
+      // + stridedInc * swizzledStride + contigInc / swizzledStride *
+      // tileSize + contigInc % swizzledStride
+      auto stridedDim = destRank - 1 - layoutOrder[0];
+      auto contigDim = destRank - 1 - layoutOrder[1];
+      auto stridedOff = smemObj.getOffsets()[stridedDim];
+      auto contigOff = smemObj.getOffsets()[contigDim];
+      auto stridedInc = offsetVals[stridedDim];
+      auto contigInc = offsetVals[contigDim];
+      int allocStridedDim = allocShape.size() - 1 - layoutOrder[0];
+      auto tileSize =
+          b.mul(b.i32_val(allocShape[allocStridedDim]), swizzleStride);
+      offset = b.sub(offset, b.mul(stridedOff, swizzleStride));
+      offset = b.sub(offset, b.mul(b.udiv(contigOff, swizzleStride), tileSize));
+      offset = b.sub(offset, b.urem(contigOff, swizzleStride));
+      offset = b.add(offset, b.mul(stridedInc, swizzleStride));
+      offset = b.add(offset, b.mul(b.udiv(contigInc, swizzleStride), tileSize));
+      offset = b.add(offset, b.urem(contigInc, swizzleStride));
+    } else {
+      // Compute the offset based on the original strides of the shared memory
+      // object
+      offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
+    }
+    auto base = smemObj.getBase();
+    auto elemPtrTy = base.getType();
+    smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
+                                 llvmElemTy, offsetVals);
     auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
     rewriter.replaceOp(op, retVal);
     return success();
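
The arithmetic in the non-simple branch is easier to follow on scalars. A minimal sketch of the rebasing formula from the comment above, with plain integers standing in for the LLVM IR values (the helper and its name are hypothetical, not part of the commit):

```cpp
#include <cstdint>

// Rebase a linear shared-memory offset when the subview lives inside a
// swizzled NVMMA tile. `offset` starts as the batch-dimension contribution;
// the old (stridedOff/contigOff) terms are peeled off and the subview's
// (stridedInc/contigInc) terms are added back, mirroring the commented
// formula in the hunk.
static int64_t rebaseSwizzledOffset(int64_t offset, int64_t stridedOff,
                                    int64_t contigOff, int64_t stridedInc,
                                    int64_t contigInc, int64_t swizzleStride,
                                    int64_t tileSize) {
  offset -= stridedOff * swizzleStride;
  offset -= (contigOff / swizzleStride) * tileSize;
  offset -= contigOff % swizzleStride;
  offset += stridedInc * swizzleStride;
  offset += (contigInc / swizzleStride) * tileSize;
  offset += contigInc % swizzleStride;
  return offset;
}
```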

lib/Dialect/TritonGPU/IR/Dialect.cpp (+1, -1)

@@ -1203,7 +1203,7 @@ LinearEncodingAttr::orderPerDim(StringAttr dimName,
 // [Note. Divergence of methods wrt. legacy layouts]
 // For smaller shapes where the CTATile is larger than the output
 // tensor, some methods return different values than the legacy layouts. I think
-// this is benign tho. An example: what is the the vector of `warpsPerCTA` if
+// this is benign tho. An example: what is the vector of `warpsPerCTA` if
 // all the warps hold the same data? I think it should be [1, 1], even if we
 // have 4 warps. But perhaps for this we have to add some masking in some
 // places... We'll see

lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeDescriptorEncoding.cpp (+69, -26)

@@ -1,22 +1,16 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Transforms/Passes.h"
-#include "triton/Analysis/AxisInfo.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
-#include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
-#include "triton/Tools/Sys/GetEnv.hpp"
 #include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/VersionTuple.h"
+#include <algorithm>
 #include <memory>
 #include <unordered_set>

@@ -35,6 +29,7 @@ struct UseInfo {
   TypedValue<tt::TensorDescType> descriptor;
   Operation *use;
   Attribute desiredSharedEncoding;
+  SmallVector<int64_t> shape;
   ttg::CTALayoutAttr ctaLayout;
 };

@@ -72,6 +67,14 @@ ttg::CTALayoutAttr getCtaLayoutFromEncoding(Attribute encoding) {
                                layout.getCTASplitNum(), layout.getCTAOrder());
 }

+SmallVector<int64_t> expandToRank(ArrayRef<int64_t> shape, int rank) {
+  SmallVector<int64_t> result(rank, 1);
+  assert(shape.size() <= rank);
+  auto rankDiff = rank - shape.size();
+  std::copy(shape.begin(), shape.end(), result.begin() + rankDiff);
+  return result;
+}
+
 std::optional<UseInfo> getUseInfo(Operation *op) {
   UseInfo info;
   info.use = op;
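
`expandToRank` right-aligns a use's shape against the descriptor rank, padding leading dimensions with 1. A standalone restatement (hypothetical test harness, `std::vector` in place of `SmallVector`):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Right-align `shape` into a rank-sized vector, filling the missing
// leading dimensions with 1; behavior matches the helper above.
static std::vector<int64_t> expandToRank(const std::vector<int64_t> &shape,
                                         size_t rank) {
  assert(shape.size() <= rank);
  std::vector<int64_t> result(rank, 1);
  std::copy(shape.begin(), shape.end(),
            result.begin() + (rank - shape.size()));
  return result;
}

int main() {
  // A 2-D tile accessed through a 3-D descriptor: {64, 32} -> {1, 64, 32}.
  assert(expandToRank({64, 32}, 3) == std::vector<int64_t>({1, 64, 32}));
}
```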
@@ -81,6 +84,9 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
     auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
                                                : load.getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = load.getResult().getType().getShape();
+    auto rank = load.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto gather = dyn_cast<tt::DescriptorGatherOp>(op)) {
@@ -89,18 +95,27 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
     auto encoding = info.desiredSharedEncoding ? info.desiredSharedEncoding
                                                : gather.getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = gather.getResult().getType().getShape();
+    auto rank = gather.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto store = dyn_cast<tt::DescriptorStoreOp>(op)) {
     info.descriptor = store.getDesc();
     auto encoding = store.getSrc().getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = store.getSrc().getType().getShape();
+    auto rank = store.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   if (auto scatter = dyn_cast<tt::DescriptorScatterOp>(op)) {
     info.descriptor = scatter.getDesc();
     auto encoding = scatter.getSrc().getType().getEncoding();
     info.ctaLayout = ttg::getCTALayout(encoding);
+    auto shape = scatter.getSrc().getType().getShape();
+    auto rank = scatter.getDesc().getType().getBlockType().getRank();
+    info.shape = expandToRank(shape, rank);
     return info;
   }
   return std::nullopt;
@@ -109,12 +124,15 @@ std::optional<UseInfo> getUseInfo(Operation *op) {
 struct EncodingInfo {
   Attribute desiredEncoding;
   ttg::CTALayoutAttr ctaLayout;
+  // Shape may be different from the descriptor block shape for gather/scatter
+  // use case
+  SmallVector<int64_t> shape;
   bool forcedToDefault = false;

   bool operator==(const EncodingInfo &other) const {
     return desiredEncoding == other.desiredEncoding &&
            ctaLayout == other.ctaLayout &&
-           forcedToDefault == other.forcedToDefault;
+           forcedToDefault == other.forcedToDefault && shape == other.shape;
   }
 };

@@ -123,7 +141,8 @@ struct EncodingInfo {
 template <> struct std::hash<EncodingInfo> {
   size_t operator()(const EncodingInfo &einfo) const {
     return llvm::hash_combine(einfo.desiredEncoding, einfo.ctaLayout,
-                              einfo.forcedToDefault);
+                              einfo.forcedToDefault,
+                              ArrayRef<int64_t>(einfo.shape));
   }
 };

@@ -172,6 +191,21 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
   // Always propagate forcedToDefault
   result.forcedToDefault = lhs.forcedToDefault || rhs.forcedToDefault;

+  if (result.forcedToDefault)
+    return result;
+
+  if (lhs.shape.empty() || lhs.shape == rhs.shape)
+    result.shape = rhs.shape;
+  else if (rhs.shape.empty())
+    result.shape = lhs.shape;
+  else {
+    assert(lhs.shape.size() == rhs.shape.size());
+    auto rank = lhs.shape.size();
+    result.shape.reserve(rank);
+    for (int i = 0; i < rank; ++i)
+      result.shape.push_back(std::min(lhs.shape[i], rhs.shape[i]));
+  }
+
   SetVector<ttg::CTALayoutAttr> ctaLayouts;
   if (lhs.ctaLayout)
     ctaLayouts.insert(lhs.ctaLayout);
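
The shape merge is an elementwise minimum over non-empty shapes: two uses of the same descriptor with shapes {1, 128, 64} and {1, 64, 64} combine to {1, 64, 64}, and an empty shape simply defers to the other side, so the eventual fallback encoding is sized for the smallest tile any use actually touches.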
@@ -190,9 +224,6 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
     break;
   }

-  if (result.forcedToDefault)
-    return result;
-
   SetVector<Attribute> desiredEncodings;
   if (lhs.desiredEncoding)
     desiredEncodings.insert(lhs.desiredEncoding);
@@ -213,23 +244,32 @@ EncodingInfo combineEncodings(const EncodingInfo &lhs, const EncodingInfo &rhs,
 }

 Attribute getFallbackSharedEncoding(RankedTensorType tensorType,
-                                    ttg::CTALayoutAttr ctaLayout) {
+                                    ttg::CTALayoutAttr ctaLayout,
+                                    ArrayRef<int64_t> usageShape) {
   auto ctx = tensorType.getContext();
   SmallVector<unsigned> order;
   for (int i = tensorType.getRank() - 1; i >= 0; --i)
     order.push_back(i);

+  ArrayRef<int64_t> shape =
+      usageShape.empty() ? tensorType.getShape() : usageShape;
   if (!ctaLayout)
     ctaLayout = ttg::CTALayoutAttr::getDefault(ctx, tensorType.getRank());
   else if (ctaLayout.getRank() != tensorType.getRank())
-    ctaLayout = ttng::updateCTALayoutForShape(ctaLayout, tensorType.getShape());
+    ctaLayout = ttng::updateCTALayoutForShape(ctaLayout, shape);
+
+  auto elemTy = tensorType.getElementType();
+  auto shapePerCTA = ttg::getShapePerCTA(ctaLayout.getCTASplitNum(), shape);
+  unsigned eleBitWidth = tensorType.getElementType().getIntOrFloatBitWidth();

-  if (tensorType.getRank() == 1) {
+  auto contigDimSizeInBytes = shapePerCTA.back() * eleBitWidth / 8;
+  auto rank = tensorType.getRank();
+  if (rank == 1 || contigDimSizeInBytes < 32 || shape[rank - 2] < 8) {
     return ttg::SwizzledSharedEncodingAttr::get(ctx, 1, 1, 1, order, ctaLayout);
   }
-  return ttg::NVMMASharedEncodingAttr::get(
-      ctx, tensorType.getShape(), order, ctaLayout, tensorType.getElementType(),
-      /*fp4Padded*/ false);
+  return ttg::NVMMASharedEncodingAttr::get(ctx, shape, order, ctaLayout,
+                                           tensorType.getElementType(),
+                                           /*fp4Padded*/ false);
 }

 tt::TensorDescType getTensorDescTypeWithEncoding(Operation *op,
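
Worked example of the new fallback guard: for an fp16 descriptor used with shape {16, 8}, contigDimSizeInBytes = 8 * 16 / 8 = 16, which is below the 32-byte floor, so the pass now returns the trivial `SwizzledSharedEncodingAttr`; previously such a shape would reach `NVMMASharedEncodingAttr` construction and, presumably, the `llvm_unreachable` path retitled in the TritonGPUAttrDefs.td hunk above.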
@@ -274,17 +314,19 @@ void assignMemoryLayouts(tt::FuncOp &func) {
   // fallback to default encoding
   for (auto blockArg : func.getBlocks().front().getArguments())
     if (auto desc = dyn_cast<TypedValue<tt::TensorDescType>>(blockArg))
-      updateEncoding({desc}, EncodingInfo{{}, {}, /*forcedToDefault=*/true});
+      updateEncoding({desc},
+                     EncodingInfo{{}, {}, {}, /*forcedToDefault=*/true});

   func.walk([&](Operation *op) {
     if (auto info = getUseInfo(op)) {
-      updateEncoding(info->descriptor, EncodingInfo{info->desiredSharedEncoding,
-                                                    info->ctaLayout});
+      updateEncoding(info->descriptor,
+                     EncodingInfo{info->desiredSharedEncoding, info->ctaLayout,
+                                  info->shape});
     } else {
       bool forcedToDefault =
           isa<tt::CallOp, tt::ReturnOp, tt::ReinterpretTensorDescOp>(op);
       auto einfo =
-          internEncoding(encodings, EncodingInfo{{}, {}, forcedToDefault});
+          internEncoding(encodings, EncodingInfo{{}, {}, {}, forcedToDefault});

       auto setEncoding = [&](Value v) {
         auto typedVal = cast<TypedValue<tt::TensorDescType>>(v);
@@ -344,9 +386,10 @@ void assignMemoryLayouts(tt::FuncOp &func) {
   if (einfo->desiredEncoding) {
     newEncoding = einfo->desiredEncoding;
   } else if (einfo->forcedToDefault) {
-    newEncoding = getFallbackSharedEncoding(existingTy, {});
+    newEncoding = getFallbackSharedEncoding(existingTy, {}, {});
   } else {
-    newEncoding = getFallbackSharedEncoding(existingTy, einfo->ctaLayout);
+    newEncoding =
+        getFallbackSharedEncoding(existingTy, einfo->ctaLayout, einfo->shape);
   }
   desc.setType(getTensorDescTypeWithEncoding(desc.getDefiningOp(), existingTy,
                                              newEncoding));
@@ -356,14 +399,14 @@ void assignMemoryLayouts(tt::FuncOp &func) {
   SmallVector<Type> resultTys(func.getResultTypes());
   for (auto [i, argTy] : llvm::enumerate(argTys)) {
     if (auto descTy = dyn_cast<tt::TensorDescType>(argTy)) {
-      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {});
+      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {}, {});
       argTys[i] = getTensorDescTypeWithEncoding(nullptr, descTy.getBlockType(),
                                                 encoding);
     }
   }
   for (auto [i, resultTy] : llvm::enumerate(resultTys)) {
     if (auto descTy = dyn_cast<tt::TensorDescType>(resultTy)) {
-      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {});
+      auto encoding = getFallbackSharedEncoding(descTy.getBlockType(), {}, {});
       resultTys[i] = getTensorDescTypeWithEncoding(
           nullptr, descTy.getBlockType(), encoding);
     }
