2 changes: 1 addition & 1 deletion oneflow/core/functional/functional_api.yaml
@@ -2789,7 +2789,7 @@
   bind_python: False
 
 - name: "nms"
-  signature: "Tensor (Tensor x, Float iou_threshold, Int32 keep_n=-1) => Nms"
+  signature: "Tensor (Tensor x, Tensor scores=None, Tensor input_indices=None, Float iou_threshold, Int32 keep_n=-1) => Nms"
Contributor: Could you check whether it would be necessary or convenient to export a standalone API/functor for the NPU nms?
   bind_python: True
 
 - name: "roi_align"
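The two new tensor parameters default to None, so existing call sites keep working unchanged. A minimal sketch of the legacy call form, assuming a build with this PR applied (box coordinates and threshold are made-up illustration values):

import oneflow as flow

# Two boxes in (x1, y1, x2, y2) form; values are illustrative only.
boxes = flow.tensor([[0.0, 0.0, 10.0, 10.0],
                     [1.0, 1.0, 11.0, 11.0]])

# Legacy form: scores and input_indices stay None, so CPU/CUDA callers are
# untouched; iou_threshold is passed by keyword.
keep = flow._C.nms(boxes, iou_threshold=0.5)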
20 changes: 19 additions & 1 deletion oneflow/core/functional/impl/array_functor.cpp
@@ -588,7 +588,25 @@ class ArgWhereFunctor {
                            const Symbol<DType>& dtype) const {
     auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("dtype");
     attrs.SetAllAttrs(dtype->data_type());
-    return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
+
+    auto device_type = DeviceType::kCPU;
+    if (x->is_global()) {
+      device_type = JUST(x->parallel_desc())->device_type();
+    } else {
+      device_type = JUST(x->device())->enum_type();
+    }
+
+    if (device_type == DeviceType::kNPU) {
+      // NOTE: use cpu argwhere when device="npu"
+      auto cpu_tensor = JUST(one::functional::To(x, "cpu"));
+      auto result = JUST(OpInterpUtil::Dispatch<TensorTuple>(*op_, {cpu_tensor}, attrs));
+      for (int i = 0; i < result->size(); ++i) {
+        (*result)[i] = JUST(one::functional::To((*result)[i], "npu"));
+      }
+      return result;
+    } else {
+      return OpInterpUtil::Dispatch<TensorTuple>(*op_, {x}, attrs);
+    }
   }
 
  private:
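The fallback above round-trips through the CPU kernel: the input is copied to the CPU, argwhere is dispatched there, and each result tensor is copied back to the NPU. The same pattern in Python, as a hedged sketch (argwhere_via_cpu is a hypothetical helper name, not part of this PR):

import oneflow as flow

def argwhere_via_cpu(x):
    # Mirror of the C++ fallback: run the CPU argwhere kernel for NPU
    # tensors, then move the result back to the original device.
    if x.device.type == "npu":
        return flow.argwhere(x.to("cpu")).to("npu")
    return flow.argwhere(x)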
27 changes: 24 additions & 3 deletions oneflow/core/functional/impl/nn_functor.cpp
@@ -4014,17 +4014,38 @@ class PariticalFCSampleDisableBoxing {
 
 class NmsFunctor {
  public:
-  NmsFunctor() { op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build()); }
+  NmsFunctor() {
+    op_ = CHECK_JUST(one::OpBuilder("nms").Input("in").Output("out").Build());
+    fused_op_ = CHECK_JUST(one::OpBuilder("nms")
+                               .Input("in")
+                               .Input("scores")
+                               .Input("input_indices")
+                               .Output("out")
+                               .Build());
+  }
 
-  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& iou_threshold,
+  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x,
+                           const Optional<one::Tensor>& scores,
+                           const Optional<one::Tensor>& input_indices, const float& iou_threshold,
                            const int32_t& keep_n) const {
     auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("iou_threshold", "keep_n");
     attrs.SetAllAttrs(iou_threshold, keep_n);
-    return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
+    DeviceType device_type = JUST(x->device())->enum_type();
+    if (device_type == DeviceType::kNPU) {
+      if (scores) {
Copilot AI (Jun 1, 2025): The fused NMS path checks only scores; it should verify both scores and input_indices are provided to avoid passing a null optional downstream.

Suggested change:
-      if (scores) {
+      if (scores && input_indices) {
+        return OpInterpUtil::Dispatch<Tensor>(*fused_op_, {x, JUST(scores), JUST(input_indices)},
+                                              attrs);
+      } else {
+        return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
+      }
+    } else {
+      return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
+    }
   }
 
  private:
   std::shared_ptr<OpExpr> op_;
+  std::shared_ptr<OpExpr> fused_op_;
 };
 
 class RoiAlignFunctor {
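At call time the functor now picks between the plain and fused op expressions. A condensed Python sketch of the same branching (dispatch_nms is a hypothetical name; it folds in the review suggestion above by also checking input_indices):

import oneflow as flow

def dispatch_nms(x, scores, input_indices, iou_threshold):
    if x.device.type == "npu" and scores is not None and input_indices is not None:
        # Fused NPU op: consumes the pre-sorted scores plus the sort indices.
        return flow._C.nms(x, scores, input_indices, iou_threshold=iou_threshold)
    # Plain single-input op on every other device (and on NPU without scores).
    return flow._C.nms(x, iou_threshold=iou_threshold)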
4 changes: 3 additions & 1 deletion oneflow/ir/include/OneFlow/OneFlowUserOps.td
@@ -1886,7 +1886,9 @@ def OneFlow_InTopKOp : OneFlow_BaseOp<"in_top_k", [NoMemoryEffect, NoGrad, Decla
 
 def OneFlow_NmsOp : OneFlow_BaseOp<"nms", [NoMemoryEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
   let input = (ins
-    OneFlow_Tensor:$in
+    OneFlow_Tensor:$in,
+    Optional<OneFlow_Tensor>:$scores,
+    Optional<OneFlow_Tensor>:$input_indices
   );
   let output = (outs
     OneFlow_Tensor:$out
6 changes: 5 additions & 1 deletion oneflow/user/ops/nms_op.cpp
@@ -26,7 +26,11 @@ Maybe<void> InferNmsTensorDesc(user_op::InferContext* ctx) {
 }
 
 Maybe<void> InferNmsDataType(user_op::InferContext* ctx) {
-  ctx->SetOutputDType("out", 0, DataType::kInt8);
+  if (ctx->parallel_desc().device_type() == DeviceType::kNPU) {
+    ctx->SetOutputDType("out", 0, DataType::kInt32);
+  } else {
+    ctx->SetOutputDType("out", 0, DataType::kInt8);
+  }
   return Maybe<void>::Ok();
 }

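A consequence of this change is that the op's output dtype is now device-dependent, so downstream code cannot assume an int8 mask. A small hedged sketch of the difference (the "npu" placement assumes an NPU-enabled build):

import oneflow as flow

boxes = flow.tensor([[0.0, 0.0, 10.0, 10.0]])
keep = flow._C.nms(boxes, iou_threshold=0.5)
# CPU/CUDA: an int8 suppression mask over the input boxes.
# NPU (per this PR): int32 output from the fused kernel.
assert keep.dtype in (flow.int8, flow.int32)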
10 changes: 8 additions & 2 deletions python/oneflow/nn/modules/nms.py
@@ -20,7 +20,13 @@
 
 def nms_op(boxes, scores, iou_threshold: float):
     score_inds = flow.argsort(scores, dim=0, descending=True)
-    boxes = flow._C.gather(boxes, score_inds, axis=0)
-    keep = flow._C.nms(boxes, iou_threshold)
+    if boxes.device == flow.device("npu"):
+        sorted_scores = flow.gather(scores, dim=0, index=score_inds)
+        keep = flow._C.nms(
+            boxes, sorted_scores, score_inds.to(flow.int32), iou_threshold=iou_threshold
+        )
+    else:
+        boxes = flow._C.gather(boxes, score_inds, axis=0)
+        keep = flow._C.nms(boxes, iou_threshold=iou_threshold)
     index = flow.squeeze(flow.argwhere(keep), dim=[1])
     return flow._C.gather(score_inds, index, axis=0)
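On both paths nms_op returns indices into the original, unsorted boxes. A minimal usage sketch with made-up values (nms_op backs flow.nms in the Python API):

import oneflow as flow

boxes = flow.tensor([[0.0, 0.0, 10.0, 10.0],
                     [1.0, 1.0, 11.0, 11.0],
                     [20.0, 20.0, 30.0, 30.0]])
scores = flow.tensor([0.9, 0.8, 0.7])

# Box 1 overlaps box 0 above the 0.5 IoU threshold (IoU is roughly 0.68)
# and is suppressed; the returned indices refer to the original box order.
kept = flow.nms(boxes, scores, 0.5)  # e.g. tensor([0, 2])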