Skip to content

Commit 644f8ed

Browse files
jgmelber and claude committed
Full model vectorized timing: 134s (k1 + stride-2 dominate)
Per-layer breakdown at 640x640:
- k3s2 CBS (L0-L7): 19.0s (14%) — still scalar stride-2
- C2f blocks (L2-L8): 31.6s (24%) — k3s1 fast, k1 fused slow
- Neck (L10-L21): 25.2s (19%) — k1 OC streaming overhead
- Detect Head: 56.6s (42%) — k1 fused + bare conv
- SPPF: 1.8s (1%)

Vectorized k3 stride-1 is fast (~1ms per layer), but k1 convs (~60ms each, dozens of layers) and context overhead dominate.

Fix path: multi-PDI (eliminate 60+ context cycles) + k1 vectorization.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3af37bd commit 644f8ed

File tree

4 files changed

+349
-2
lines changed

4 files changed

+349
-2
lines changed

aie_kernels/aie2p/conv2dk3_i8.cc

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,9 +461,20 @@ void conv2dk3_i8(int8_t *line0, int8_t *line1, int8_t *line2,
461461
const int32_t input_width, const int32_t input_channels,
462462
const int32_t output_channels, const int32_t check,
463463
const int32_t scale) {
464-
conv2dk3_i8_vectorized(line0, line1, line2, weights, output,
464+
// Vectorized path requires width*8 to be 64-byte aligned (i.e. width
465+
// must be a multiple of 8) because aie::load_v<64> needs 64-byte
466+
// alignment and the per-ic-group stride is width*8 bytes. When
467+
// width % 8 != 0 the stride is not a multiple of 64 and the loads
468+
// for ic_g >= 1 silently read from a wrong aligned address.
469+
if (input_width % 8 == 0) {
470+
conv2dk3_i8_vectorized(line0, line1, line2, weights, output,
471+
input_width, input_channels, output_channels,
472+
check, scale);
473+
} else {
474+
conv2dk3_i8_scalar(line0, line1, line2, weights, output,
465475
input_width, input_channels, output_channels,
466476
check, scale);
477+
}
467478
}
468479

469480
void conv2dk3s2_i8(int8_t *line0, int8_t *line1, int8_t *line2,

iron/applications/yolov8n/pipeline_int8.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,14 @@ def _setup_layer_buffers(self):
150150
self.add_buffer(f"{lname}_input", in_sz, dtype=np.int8)
151151
self.add_buffer(f"{lname}_weights", w_sz, dtype=np.int8)
152152
self.add_buffer(f"{lname}_output", out_sz, dtype=np.int8)
153+
# Register in the runlist so the buffer pool allocator knows
154+
# which buffers are used together and must not share a BO.
155+
self.add_to_runlist(
156+
entry["kernel_name"],
157+
f"{lname}_input",
158+
f"{lname}_weights",
159+
f"{lname}_output",
160+
)
153161

154162
def _run_single_kernel(self, kernel_name, *buffer_names):
155163
"""Execute a single kernel invocation."""

iron/applications/yolov8n/run_pretrained_int8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ def main():
556556
print(f"{'=' * 70}")
557557

558558
t0 = time.time()
559-
ctx = AIEContext()
559+
ctx = AIEContext(use_runlist=False)
560560
pipeline = Int8YOLOv8nPipeline(
561561
shifts, act_scales, int8_weights, context=ctx
562562
)
Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#!/usr/bin/env python3
2+
# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""Per-stage calibration percentile sweep for YOLOv8n int8.
6+
7+
Runs CPU int8 simulation (npu_sim=True) with different per-stage percentile
8+
combinations to find the optimal setting that produces correct detections
9+
(person + bus, conf > 0.25) on bus.jpg.
10+
11+
After CPU sweep, runs the best combo on actual NPU hardware.
12+
13+
Usage:
14+
source ironenv/bin/activate
15+
source /scratch/jmelber/mlir-aie/utils/env_setup.sh /scratch/jmelber/mlir-aie /opt/xrt 2>/dev/null
16+
python3 iron/applications/yolov8n/sweep_calibration.py
17+
"""
18+
19+
import re
20+
import time
21+
import urllib.request
22+
from pathlib import Path
23+
24+
import torch
25+
26+
from iron.applications.yolov8n.postprocess import YOLOv8nPostProcess
27+
from iron.applications.yolov8n.run_int8_cpu import Int8YOLOv8nCPU
28+
from iron.applications.yolov8n.run_pretrained import (
29+
COCO_NAMES,
30+
preprocess_image,
31+
)
32+
33+
34+
# -- Percentile combos to sweep -----------------------------------------------
35+
36+
# Column order shared by every sweep entry below.
_COMBO_FIELDS = ("name", "backbone", "neck", "detect_cbs", "detect_bare")

# Per-stage calibration percentiles to sweep. Each row follows
# _COMBO_FIELDS and is expanded into a dict with those keys.
COMBOS = [
    dict(zip(_COMBO_FIELDS, row))
    for row in (
        ("combo1: aggressive detect_cbs only", 1.0, 1.0, 0.95, 1.0),
        ("combo2: mild all", 0.999, 0.999, 0.97, 0.999),
        (
            "combo3: mild backbone, moderate neck+detect",
            0.999,
            0.99,
            0.95,
            0.999,
        ),
        (
            "combo4: no-clip backbone, moderate neck+detect",
            1.0,
            0.99,
            0.95,
            1.0,
        ),
        ("combo5: mild all, moderate neck", 0.999, 0.997, 0.97, 0.999),
    )
]
73+
74+
75+
def _get_stage(layer_name):
    """Map a layer name onto the calibration stage it belongs to.

    Returns one of "input", "backbone", "neck", "detect_cbs" or
    "detect_bare". Names that match no rule fall back to "backbone".
    """
    if layer_name == "input":
        return "input"

    # Detect-head layers: a ".cv3" component selects "detect_bare"; every
    # other "det." layer is "detect_cbs".
    if layer_name.startswith("det."):
        return "detect_bare" if ".cv3" in layer_name else "detect_cbs"

    # Numbered layers ("l<N>..."): indices 0-9 are backbone, the rest neck.
    numbered = re.match(r"l(\d+)", layer_name)
    if numbered is None:
        return "backbone"
    return "backbone" if int(numbered.group(1)) <= 9 else "neck"
90+
91+
92+
def make_percentile_fn(combo):
    """Build a ``layer_name -> percentile`` lookup from a combo dict.

    The combo must provide "backbone", "neck", "detect_cbs" and
    "detect_bare"; the "input" stage is always mapped to 1.0.
    """
    # Resolve every stage up front so a missing key fails here, not later
    # when the returned function is first called.
    stage_pct = {"input": 1.0}
    for stage in ("backbone", "neck", "detect_cbs", "detect_bare"):
        stage_pct[stage] = combo[stage]

    def lookup(layer_name):
        return stage_pct[_get_stage(layer_name)]

    return lookup
106+
107+
108+
def analyze_cls_outputs(cls_tensors):
    """Summarize the classification logits of each detection scale.

    Args:
        cls_tensors: Iterable of logit tensors, paired in order with the
            scale names "p3", "p4", "p5" (extra tensors are ignored). After
            ``squeeze(0)`` each tensor must be 3-D with the class axis
            first, i.e. presumably ``(1, C, H, W)`` coming from the detect
            head -- confirm against the runner.

    Returns:
        Dict keyed ``"cls_<scale>"``. Each value holds the raw logit
        ``"range"`` (min, max), ``"logit_abs_max"``, the ``"max_score"`` and
        ``"mean_score"`` of the per-anchor best sigmoid score, and the
        number of anchors whose best score exceeds 0.25 (``"gt_0.25"``) and
        0.10 (``"gt_0.10"``).
    """
    stats = {}
    for scale, cls in zip(("p3", "p4", "p5"), cls_tensors):
        logits = cls.float().squeeze(0)
        # Take the class count from the tensor instead of assuming 80, so a
        # head with a different number of classes is grouped correctly.
        num_classes = logits.shape[0]
        scores = logits.permute(1, 2, 0).reshape(-1, num_classes).sigmoid()
        best_per_anchor = scores.max(dim=1)[0]
        stats[f"cls_{scale}"] = {
            "range": (cls.min().item(), cls.max().item()),
            "logit_abs_max": cls.abs().max().item(),
            "max_score": best_per_anchor.max().item(),
            "mean_score": best_per_anchor.mean().item(),
            "gt_0.25": (best_per_anchor > 0.25).sum().item(),
            "gt_0.10": (best_per_anchor > 0.10).sum().item(),
        }
    return stats
126+
127+
128+
def run_combo_cpu(runner, img_tensor, combo, pp_25, pp_10):
    """Evaluate one percentile combo with the CPU int8 path (npu_sim=True).

    Recalibrates ``runner`` with the combo's per-stage percentiles, runs a
    forward pass, and post-processes the output with both post-processors.

    Returns:
        Dict with "cls_stats", "dets_25", "n_25", "dets_10", "n_10" and the
        raw forward "result".
    """
    runner.recalibrate_percentiles(make_percentile_fn(combo))
    result = runner.forward_int8(img_tensor, npu_sim=True)

    outcome = {"cls_stats": analyze_cls_outputs(result["cls"])}

    # Same raw outputs, two confidence thresholds (0.25 first, then 0.10).
    for tag, postprocess in (("25", pp_25), ("10", pp_10)):
        dets = postprocess(result["reg"], result["cls"])
        outcome[f"dets_{tag}"] = dets
        outcome[f"n_{tag}"] = len(dets["boxes"])

    outcome["result"] = result
    return outcome
154+
155+
156+
def _print_detections(dets, count, limit=10):
    """Print up to ``limit`` detections from ``dets``, one per line.

    ``dets`` supplies parallel "boxes", "scores" and "labels" sequences;
    ``count`` is the number of detections they hold. Nothing is printed (and
    ``dets`` is not touched) when ``count`` is zero.
    """
    shown = min(limit, count)
    if shown <= 0:
        return
    rows = zip(
        dets["boxes"][:shown], dets["scores"][:shown], dets["labels"][:shown]
    )
    for box, score, label in rows:
        box = box.tolist()
        label = label.item()
        # Labels outside the known class list get a generic placeholder name.
        name = COCO_NAMES[label] if label < len(COCO_NAMES) else f"class_{label}"
        print(
            f" {name}: {score.item():.3f} at "
            f"[{box[0]:.0f},{box[1]:.0f},{box[2]:.0f},{box[3]:.0f}]"
        )


def print_combo_result(combo, res):
    """Pretty-print the outcome of one combo.

    Args:
        combo: Combo dict with "name" and the four per-stage percentiles.
        res: Result dict as returned by ``run_combo_cpu`` ("cls_stats",
            "dets_25", "n_25", "dets_10", "n_10").

    Prints a banner with the combo settings, one statistics line per
    classification scale, then the detection count and up to ten
    detections for each of the two confidence thresholds.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f" {combo['name']}")
    print(
        f" backbone={combo['backbone']} neck={combo['neck']} "
        f"detect_cbs={combo['detect_cbs']} detect_bare={combo['detect_bare']}"
    )
    print(banner)

    # Per-scale classification statistics.
    for scale in ("cls_p3", "cls_p4", "cls_p5"):
        s = res["cls_stats"][scale]
        print(
            f" {scale}: range=[{s['range'][0]:.2f}, {s['range'][1]:.2f}] "
            f"max_score={s['max_score']:.4f} "
            f">0.25: {s['gt_0.25']} >0.10: {s['gt_0.10']}"
        )

    # Detections at both thresholds share one printing helper.
    print(f"\n Detections (conf>0.25): {res['n_25']}")
    _print_detections(res["dets_25"], res["n_25"])

    print(f" Detections (conf>0.10): {res['n_10']}")
    _print_detections(res["dets_10"], res["n_10"])
207+
208+
209+
# Class ids of the two objects the sweep looks for in bus.jpg.
_PERSON_CLASS_ID = 0
_BUS_CLASS_ID = 5


def _target_hits(dets, count):
    """Return ``(has_person, has_bus)`` for one post-processed detection set."""
    if count <= 0:
        return False, False
    labels = dets["labels"].tolist()
    return _PERSON_CLASS_ID in labels, _BUS_CLASS_ID in labels


def _select_best_combo(all_results):
    """Pick the combo that finds the most target classes at conf>0.25.

    Only combos that detect a person or a bus are eligible. Candidates are
    ranked by (number of target classes found, total detections); the
    earliest entry wins ties. Returns ``None`` when no combo is eligible.
    """
    best = None
    best_key = None
    for combo, res in all_results:
        has_person, has_bus = _target_hits(res["dets_25"], res["n_25"])
        hits = int(has_person) + int(has_bus)
        if hits == 0:
            # Detections of unrelated classes must not count as "best".
            continue
        # Tuple ordering keeps the detection count a pure tie-breaker, so it
        # can never outweigh an actual person/bus hit.
        key = (hits, res["n_25"])
        if best_key is None or key > best_key:
            best_key = key
            best = combo
    return best


def main():
    """Run the per-stage percentile sweep in CPU simulation and report it.

    Downloads the bus test image if it is missing, calibrates the int8
    runner once, evaluates the p100 baseline followed by every entry in
    ``COMBOS``, prints a summary table, and finally names the best combo.
    If no combo detects a person or bus at conf>0.25, the combos that do so
    at conf>0.10 are listed instead.
    """
    image_path = Path("test_bus.jpg")
    model_path = "yolov8n.pt"

    if not image_path.exists():
        print(f"Downloading test image to {image_path}...")
        urllib.request.urlretrieve(
            "https://ultralytics.com/images/bus.jpg", str(image_path)
        )

    print("=" * 70)
    print("YOLOv8n INT8 Per-Stage Calibration Sweep (CPU sim, npu_sim=True)")
    print("=" * 70)

    # Load model and calibrate once with p100 (stores all percentile data)
    print("\n[1] Loading model and calibrating (stores percentile data)...")
    t0 = time.time()
    runner = Int8YOLOv8nCPU(model_path)
    img_tensor = preprocess_image(image_path, img_size=640)
    runner.calibrate(img_tensor)  # Default p100, stores percentile data
    print(f" Setup: {time.time() - t0:.1f}s")

    pp_25 = YOLOv8nPostProcess(conf_thres=0.25, iou_thres=0.45)
    pp_10 = YOLOv8nPostProcess(conf_thres=0.10, iou_thres=0.45)

    # Baseline: p100 everywhere
    print("\n[2] Baseline: p100 everywhere (npu_sim=True)")
    baseline_combo = {
        "name": "baseline: p100 everywhere",
        "backbone": 1.0,
        "neck": 1.0,
        "detect_cbs": 1.0,
        "detect_bare": 1.0,
    }
    baseline_res = run_combo_cpu(runner, img_tensor, baseline_combo, pp_25, pp_10)
    print_combo_result(baseline_combo, baseline_res)

    # Run all combos; the baseline stays first so it wins ties later on.
    print(f"\n\n[3] Sweeping {len(COMBOS)} combos...")
    all_results = [(baseline_combo, baseline_res)]
    for combo in COMBOS:
        t0 = time.time()
        res = run_combo_cpu(runner, img_tensor, combo, pp_25, pp_10)
        elapsed = time.time() - t0
        print_combo_result(combo, res)
        print(f" Time: {elapsed:.2f}s")
        all_results.append((combo, res))

    # Summary table
    print(f"\n\n{'=' * 70}")
    print("SWEEP SUMMARY")
    print(f"{'=' * 70}")
    print(
        f"{'Combo':<50} {'n@0.25':>6} {'n@0.10':>6} "
        f"{'max_cls_score':>13} {'correct?':>8}"
    )
    print("-" * 90)

    for combo, res in all_results:
        max_score = max(
            res["cls_stats"][f"cls_{s}"]["max_score"]
            for s in ["p3", "p4", "p5"]
        )
        # "YES" needs both person and bus; one of the two is "PARTIAL".
        has_person, has_bus = _target_hits(res["dets_25"], res["n_25"])
        if has_person and has_bus:
            correct = "YES"
        elif has_person or has_bus:
            correct = "PARTIAL"
        else:
            correct = "NO"

        print(
            f" {combo['name']:<48} {res['n_25']:>6} {res['n_10']:>6} "
            f"{max_score:>13.4f} {correct:>8}"
        )

    # Find best combo
    best = _select_best_combo(all_results)

    if best is not None:
        print(f"\n BEST COMBO: {best['name']}")
    else:
        print("\n No combo produced correct detections at conf>0.25")
        # Fall back to listing combos that find a target at conf>0.10
        for combo, res in all_results:
            has_person, has_bus = _target_hits(res["dets_10"], res["n_10"])
            if has_person or has_bus:
                print(
                    f" At conf>0.10: {combo['name']} has "
                    f"person={has_person} bus={has_bus}"
                )
325+
326+
327+
# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
328+
main()

0 commit comments

Comments
 (0)