WAYKEN-TSE
diff --git a/‎paddlemix/triton_ops/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎paddlemix/triton_ops/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎paddlemix/triton_ops/triton_ops.py‎
Lines changed: 295 additions & 0 deletions b/‎paddlemix/triton_ops/triton_ops.py‎
Lines changed: 295 additions & 0 deletions
diff --git a/‎ppdiffusers/deploy/sd3/README.md‎
Lines changed: 31 additions & 0 deletions b/‎ppdiffusers/deploy/sd3/README.md‎
Lines changed: 31 additions & 0 deletions
@@ -21,6 +21,8 @@
         fused_rotary_emb,
         paddle_use_triton,
         rms_norm,
+        split_concat,
+        triton_split,
         weight_only_int8,
     )
     from .triton_utils import (
@@ -39,6 +41,8 @@
         "rms_norm",
         "get_dtype_str",
         "fused_rotary_emb",
+        "split_concat",
+        "triton_split",
     ]
 except:
     pass
@@ -1580,3 +1580,298 @@ def fused_rotary_emb(
             outputs={"q_out": q_out, "k_out": k_out, "v_out": v_out},
         )
         return q_out, k_out, v_out
+
+
+########################### split concat ###############################
+# C++ source template for the `split_concat` custom op.
+# The generated `${op_name}_func` takes two tensors x and y, reads
+# batch = x.dims()[0], seq_qkv = x.dims()[1], seq_eqkv = y.dims()[1] and
+# output_hidden = x.dims()[2] / 3, allocates three outputs of shape
+# {batch, seq_qkv + seq_eqkv, output_hidden} with x's dtype and place, splices in
+# `tune_and_invoke_part`, and returns the three output tensors.
+# `${op_name}_InferShape` reports -1 for the sequence dim unless both input
+# sequence dims are positive.
+# NOTE(review): `${op_name}` is presumably substituted by the `paddle_use_triton`
+# machinery (not visible in this chunk) — confirm.
+# NOTE(review): the C++ local names (out0, out1, out2, qkv, eqkv, batch, seq_qkv,
+# seq_eqkv, output_hidden) mirror the parameters of `split_concat_kernel`;
+# presumably `tune_and_invoke_part` refers to them by name — confirm before
+# renaming any of them.
+split_concat_template = (
+    """
+std::vector<paddle::Tensor> ${op_name}_func(
+    const paddle::Tensor &x,
+    const paddle::Tensor &y) {
+
+  int batch = x.dims()[0];
+  
+  int seq_qkv = x.dims()[1];
+  int seq_eqkv = y.dims()[1];
+  int output_hidden = x.dims()[2] / 3;
+  
+  
+  auto qkv = get_tensor_ptr(x);
+  auto eqkv = get_tensor_ptr(y);
+  
+  
+  auto out0_tensor = paddle::empty({batch, seq_qkv+seq_eqkv, output_hidden}, x.dtype(), x.place());
+  auto out1_tensor = paddle::empty({batch, seq_qkv+seq_eqkv, output_hidden}, x.dtype(), x.place());
+  auto out2_tensor = paddle::empty({batch, seq_qkv+seq_eqkv, output_hidden}, x.dtype(), x.place());
+  
+  auto out0 = get_tensor_ptr(out0_tensor);
+  auto out1 = get_tensor_ptr(out1_tensor);
+  auto out2 = get_tensor_ptr(out2_tensor);
+  
+  
+  auto  run_stream = out0_tensor.stream();
+  
+"""
+    + tune_and_invoke_part
+    + """
+    return {out0_tensor, out1_tensor, out2_tensor};
+}
+
+std::vector<std::vector<int64_t>> ${op_name}_InferShape(
+        const std::vector<int64_t>& A_shape, const std::vector<int64_t>& B_shape) {
+  
+  int64_t seq1 = A_shape[1];
+  int64_t seq2 = B_shape[1];
+  int64_t seq = -1;
+  if (seq1 > 0 && seq2 > 0){
+    seq = seq1 + seq2;
+  }
+  std::vector<int64_t> out_shape = {A_shape[0], seq, A_shape[2]/3};
+  
+  return {out_shape, out_shape, out_shape};
+}
+
+std::vector<paddle::DataType> ${op_name}_InferDtype(const paddle::DataType& A_dtype) {
+    return {A_dtype, A_dtype, A_dtype};
+}
+
+PD_BUILD_OP(${op_name})
+    .Inputs({"x", "y"})
+    .Outputs({"out0_tensor", "out1_tensor", "out2_tensor"})
+    .SetKernelFn(PD_KERNEL(${op_name}_func))
+    .SetInferDtypeFn(PD_INFER_DTYPE(${op_name}_InferDtype))
+    .SetInferShapeFn(PD_INFER_SHAPE(${op_name}_InferShape));
+"""
+)
+
+
+# Triton kernel behind `split_concat`. With the launch grid used by the wrapper
+# ("3", "batch", "seq_qkv + seq_eqkv"), each program handles one
+# (output index, batch index, output row) triple and copies `output_hidden`
+# elements from either `qkv` or `eqkv` into one of out0/out1/out2.
+# NOTE(review): the offset arithmetic assumes contiguous row-major inputs laid
+# out as [batch, seq, 3 * output_hidden] — confirm against callers.
+@paddle_use_triton(
+    custom_op_template=split_concat_template,
+    key=["1"],
+)
+def split_concat_kernel(
+    out0,
+    out1,
+    out2,
+    qkv,
+    eqkv,
+    batch,
+    seq_qkv,
+    seq_eqkv,
+    output_hidden,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Grid axis 0 selects which of the three outputs this program fills.
+    out_id = tl.program_id(axis=0)
+    # NOTE: this rebinds the `batch` parameter to the program's batch index.
+    batch = tl.program_id(axis=1)
+    out_row = tl.program_id(axis=2)
+    # Output rows below seq_qkv are read from `qkv`; the remaining rows are read
+    # from `eqkv` after re-basing the row index. `out_id * output_hidden` picks
+    # the out_id-th third of a source row that is 3 * output_hidden wide.
+    if out_row < seq_qkv:
+        read_ptr = out_id * output_hidden + out_row * 3 * output_hidden + batch * seq_qkv * output_hidden * 3 + qkv
+    else:
+        read_ptr = (
+            out_id * output_hidden
+            + (out_row - seq_qkv) * 3 * output_hidden
+            + batch * seq_eqkv * output_hidden * 3
+            + eqkv
+        )
+
+    # BLOCK_SIZE may exceed output_hidden; the mask disables the surplus lanes.
+    read_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = read_offsets < output_hidden
+    read_data = tl.load(read_ptr + read_offsets, mask=mask)
+
+    # Pick the destination buffer for out_id (out0 unless out_id is 1 or 2).
+    real_output = out0
+    if out_id == 1:
+        real_output = out1
+    elif out_id == 2:
+        real_output = out2
+
+    # Destination is row `out_row` of batch `batch` in a buffer that has
+    # seq_qkv + seq_eqkv rows of output_hidden elements per batch entry.
+    write_ptr = batch * (seq_qkv + seq_eqkv) * output_hidden + out_row * output_hidden + real_output + read_offsets
+
+    tl.store(write_ptr, read_data, mask=mask)
+
+
+def split_concat(x, y):
+    assert len(x.shape) == 3
+    assert len(y.shape) == 3
+
+    assert x.shape[0] == y.shape[0]
+    assert x.shape[2] == y.shape[2]
+
+    batch = x.shape[0]
+    seq_qkv = x.shape[1]
+    hidd_x = x.shape[2]
+    seq_eqkv = y.shape[1]
+    ouput_hidden = hidd_x // 3
+    BLOCK_SIZE = triton.next_power_of_2(ouput_hidden)
+    op_name = "split_concat"
+    op_name += get_dtype_str(x.dtype)
+    op_name += f"_{BLOCK_SIZE}"
+
+    if op_name not in OpProtoHolder.instance().op_proto_map.keys():
+        out0 = paddle.empty(shape=[batch, seq_qkv + seq_eqkv, ouput_hidden], dtype=x.dtype)
+        out1 = paddle.empty(shape=[batch, seq_qkv + seq_eqkv, ouput_hidden], dtype=x.dtype)
+        out2 = paddle.empty(shape=[batch, seq_qkv + seq_eqkv, ouput_hidden], dtype=x.dtype)
+        grid = ("3", "batch", "seq_qkv + seq_eqkv")
+
+        split_concat_kernel[(op_name, grid)](
+            out0, out1, out2, x, y, batch, seq_qkv, seq_eqkv, ouput_hidden, BLOCK_SIZE=BLOCK_SIZE
+        )
+
+    if in_dynamic_or_pir_mode():
+        print(f"== we are in dynamic mode, op_name: {op_name}")
+        outs = _C_ops._run_custom_op(
+            op_name,
+            x,
+            y,
+        )
+        return outs[0], outs[1], outs[2]
+    else:
+        print(f"== we are in dynamic to static mode, op_name: {op_name}")
+        helper = LayerHelper(op_name, **locals())
+        inputs = {
+            "x": x,
+            "y": y,
+        }
+        out0 = helper.create_variable_for_type_inference(dtype=x.dtype)
+        out1 = helper.create_variable_for_type_inference(dtype=x.dtype)
+        out2 = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+        helper.append_op(
+            type=op_name,
+            inputs=inputs,
+            outputs={"out0_tensor": out0, "out1_tensor": out1, "out2_tensor": out2},
+        )
+        return out0, out1, out2
+
+
+########################### triton split ###############################
+# C++ source template for the `triton_split` custom op.
+# The generated `${op_name}_func` takes x plus the attributes `num_or_sections`
+# and `axis`, allocates out0 {x.dims()[0], num_or_sections[0], x.dims()[2]} and
+# out1 {x.dims()[0], num_or_sections[1], x.dims()[2]} with x's dtype and place,
+# splices in `tune_and_invoke_part`, and returns both tensors.
+# NOTE(review): `axis` is not referenced in the C++ visible here, and only the
+# first two entries of `num_or_sections` are read.
+# NOTE(review): `${op_name}_InferShape` hard-codes the output sequence lengths as
+# 1024 and 154 instead of deriving them from `num_or_sections`, so statically
+# inferred shapes only match the runtime outputs for that one split — TODO
+# confirm and generalize.
+triton_split_template = (
+    """
+std::vector<paddle::Tensor> ${op_name}_func(
+    const paddle::Tensor &x,
+    const std::vector<int64_t> num_or_sections,
+    const int64_t axis) {
+
+  int output_batch = x.dims()[0];
+  int output_seq0 = num_or_sections[0];
+  int output_seq1 = num_or_sections[1];
+  int output_hidden = x.dims()[2];
+
+  auto out0_tensor = paddle::empty({output_batch, output_seq0, output_hidden}, x.dtype(), x.place());
+  auto out1_tensor = paddle::empty({output_batch, output_seq1, output_hidden}, x.dtype(), x.place());
+  
+  auto out0 = get_tensor_ptr(out0_tensor);
+  auto out1 = get_tensor_ptr(out1_tensor);
+  
+  auto input = get_tensor_ptr(x);
+  
+  auto  run_stream = out0_tensor.stream();
+  
+"""
+    + tune_and_invoke_part
+    + """
+    return {out0_tensor, out1_tensor};
+}
+
+std::vector<std::vector<int64_t>> ${op_name}_InferShape(
+        const std::vector<int64_t>& A_shape) {
+  
+  std::vector<int64_t> out_shape0 = {A_shape[0], 1024, A_shape[2]};
+  std::vector<int64_t> out_shape1 = {A_shape[0], 154, A_shape[2]};
+  
+  return {out_shape0, out_shape1};
+}
+
+std::vector<paddle::DataType> ${op_name}_InferDtype(const paddle::DataType& A_dtype) {
+    return {A_dtype, A_dtype};
+}
+
+PD_BUILD_OP(${op_name})
+    .Inputs({"x"})
+    .Outputs({"out0_tensor", "out1_tensor"})
+    .SetKernelFn(PD_KERNEL(${op_name}_func))
+    .Attrs({"num_or_sections: std::vector<int64_t>", "axis: int64_t"})
+    .SetInferDtypeFn(PD_INFER_DTYPE(${op_name}_InferDtype))
+    .SetInferShapeFn(PD_INFER_SHAPE(${op_name}_InferShape));
+"""
+)
+
+
+# Triton kernel behind `triton_split`. With the launch grid used by the wrapper
+# ("output_batch", "output_seq0+output_seq1"), each program copies one input row
+# of `output_hidden` elements into either out0 or out1.
+# NOTE(review): the offset arithmetic assumes a contiguous row-major input with
+# output_seq0 + output_seq1 rows per batch entry — confirm against callers.
+# NOTE(review): `output_batch` is not referenced in the body; it is named by the
+# wrapper's grid expression. `input` shadows the Python builtin, but the C++
+# template declares a local with the same name, so presumably it must not be
+# renamed on its own — confirm.
+@paddle_use_triton(
+    custom_op_template=triton_split_template,
+    key=["1"],
+)
+def triton_split_kernel(
+    out0,
+    out1,
+    input,
+    output_seq0,
+    output_seq1,
+    output_batch,
+    output_hidden,
+    BLOCK_SIZE: tl.constexpr,
+):
+    batch = tl.program_id(axis=0)
+    out_row = tl.program_id(axis=1)
+    # Source row `out_row` of batch entry `batch`.
+    read_ptr = out_row * output_hidden + batch * (output_seq0 + output_seq1) * output_hidden + input
+
+    # BLOCK_SIZE may exceed output_hidden; the mask disables the surplus lanes.
+    read_offsets = tl.arange(0, BLOCK_SIZE)
+    mask = read_offsets < output_hidden
+    read_data = tl.load(read_ptr + read_offsets, mask=mask)
+
+    # The first output_seq0 rows go to out0; later rows go to out1 with the row
+    # index re-based to start at zero.
+    if out_row < output_seq0:
+        write_ptr = batch * output_seq0 * output_hidden + out_row * output_hidden + out0 + read_offsets
+    else:
+        write_ptr = batch * output_seq1 * output_hidden + (out_row - output_seq0) * output_hidden + out1 + read_offsets
+
+    tl.store(write_ptr, read_data, mask=mask)
+
+
+def triton_split(x, num_or_sections=[-1, -1], axis=1):
+    assert len(x.shape) == 3
+    output_batch = x.shape[0]
+    output_seq0 = num_or_sections[0]
+    output_seq1 = num_or_sections[1]
+    output_hidden = x.shape[2]
+
+    BLOCK_SIZE = triton.next_power_of_2(output_hidden)
+    op_name = "triton_split"
+    op_name += get_dtype_str(x.dtype)
+    op_name += f"_{BLOCK_SIZE}"
+
+    if op_name not in OpProtoHolder.instance().op_proto_map.keys():
+        out0 = paddle.empty(shape=[output_batch, output_seq0, output_hidden], dtype=x.dtype)
+        out1 = paddle.empty(shape=[output_batch, output_seq1, output_hidden], dtype=x.dtype)
+        grid = ("output_batch", "output_seq0+output_seq1")
+
+        triton_split_kernel[(op_name, grid)](
+            out0, out1, x, output_seq0, output_seq1, output_batch, output_hidden, BLOCK_SIZE=2048
+        )
+
+    if in_dynamic_or_pir_mode():
+        print(f"== we are in dynamic mode, op_name: {op_name}")
+        outs = _C_ops._run_custom_op(
+            op_name,
+            x,
+            num_or_sections,
+            axis,
+        )
+        return outs[0], outs[1]
+    else:
+        print(f"== we are in dynamic to static mode, op_name: {op_name}")
+        helper = LayerHelper(op_name, **locals())
+        inputs = {
+            "x": x,
+        }
+        out0 = helper.create_variable_for_type_inference(dtype=x.dtype)
+        out1 = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+        helper.append_op(
+            type=op_name,
+            inputs=inputs,
+            attrs={
+                "num_or_sections": num_or_sections,
+                "axis": axis,
+            },
+            outputs={"out0_tensor": out0, "out1_tensor": out1},
+        )
+        return out0, out1
@@ -0,0 +1,31 @@
+# Stable Diffusion 3 高性能推理
+
+- Paddle Inference 提供 Stable Diffusion 3 模型高性能推理实现，相比 Paddle 动态图推理耗时降低 70%+（性能数据见文末表格）。
+
+环境准备：
+```shell
+# 安装 triton并适配paddle
+python -m pip install triton
+python -m pip install git+https://github.com/zhoutianzi666/UseTritonInPaddle.git
+python -c "import use_triton_in_paddle; use_triton_in_paddle.make_triton_compatible_with_paddle()"
+
+# 安装develop版本的paddle，请根据自己的cuda版本选择对应的paddle版本，这里选择12.3的cuda版本
+python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu123/
+
+# 指定 libCutlassGemmEpilogue.so 的路径
+# 详情请参考 https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/README.md
+export LD_LIBRARY_PATH=/your_dir/Paddle/paddle/phi/kernels/fusion/cutlass/gemm_epilogue/build:$LD_LIBRARY_PATH
+```
+
+高性能推理指令：
+```shell
+# 执行FP16推理
+python  text_to_image_generation-stable_diffusion_3.py  --dtype float16 --height 512 --width 512 \
+--num-inference-steps 50 --inference_optimize 1  \
+--benchmark 1
+```
+
+- 在 NVIDIA A100-SXM4-40GB 上测试的性能如下：
+
+| Paddle Inference|    PyTorch   | Paddle 动态图 |
+| --------------- | ------------ | ------------ |
+|       1.2 s     |     1.78 s   |    4.202 s   |