Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ Use keyboard shortcuts in mprocs: `s` to start, `x` to stop, `r` to restart, `q`
| `wizard-tail-run` | Tail the wizard's verbose output (`/tmp/posthog-wizard.log`) |
| `wizard-ci-run` | Full CI flow: run wizard, create PR, evaluate |
| `wizard-ci-local-run` | CI flow with local evaluation (no PR) |
| `wizard-ci-benchmark` | Benchmark only: run wizard with per-phase tracking (no evaluation) |
| `wizard-ci-create-pr` | Push branch and create PR only (skip wizard run) |
| `wizard-ci-evaluate-pr` | Evaluate an existing PR or local branch |

Expand Down Expand Up @@ -160,4 +161,36 @@ You can activate `wizard-ci.yml` in a few ways:

1. **Manual** - Run from GitHub Actions UI
2. **Schedule** - Runs on cron
3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger`
3. **Dispatch** - Webhook call via `repository_dispatch` with event type `wizard-ci-trigger`

---

## Benchmarking

Wizard CI runs automatically collect per-phase token usage, cost, and timing data. The wizard's `--benchmark` flag is always enabled in CI mode, breaking execution into separate agent calls per workflow phase (setup, 1.0-begin, 1.1-edit, 1.2-revise, 1.3-conclude).

### Running a benchmark

In mprocs, start **`wizard-ci-benchmark`** (press `s`), pick your test app, and the benchmark table prints after the wizard completes:

```
┌─────────────┬──────────┬──────────┬───────────┬───────┬─────────┐
│ Phase │ Input │ Output │ Cost │ Turns │ Time │
├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤
│ Setup │ 1,234 │ 567 │ $0.0234 │ 5 │ 42.3s │
│ 1.0-begin │ 12,345 │ 2,345 │ $0.1234 │ 12 │ 123.4s │
│ 1.1-edit │ 34,567 │ 8,901 │ $0.3456 │ 25 │ 234.5s │
│ 1.2-revise │ 8,901 │ 1,234 │ $0.0890 │ 8 │ 67.8s │
│ 1.3-conclude│ 5,678 │ 2,345 │ $0.0567 │ 10 │ 89.0s │
├─────────────┼──────────┼──────────┼───────────┼───────┼─────────┤
│ TOTAL │ 62,725 │ 15,392 │ $0.6381 │ 60 │ 557.0s │
└─────────────┴──────────┴──────────┴───────────┴───────┴─────────┘
```

Benchmark data is also saved to `test-evaluations/<run-name>/benchmark.json`.

Use **`wizard-ci-run`** instead if you want the full flow with a GitHub PR — the benchmark table is included in the PR body as markdown.

### Raw data

The wizard writes `/tmp/posthog-wizard-benchmark.json` with per-step token counts, cost, duration, and model usage. The CI runner reads and cleans up this file automatically.
6 changes: 6 additions & 0 deletions mprocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ procs:
autostart: false
env_file: .env

wizard-ci-benchmark:
# Runs wizard with benchmark mode only (no evaluation)
shell: "pnpm wizard-ci --local"
autostart: false
env_file: .env

wizard-ci-create-pr:
# Only pushes branch and creates PR on GH
shell: "pnpm wizard-ci --push-only --branch"
Expand Down
229 changes: 229 additions & 0 deletions services/wizard-ci/benchmark.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
/**
* Benchmark data types and formatting for wizard CI runs.
* Reads per-phase token usage data written by the wizard's --benchmark mode.
*/
import { readFileSync, unlinkSync, existsSync } from "fs";

/**
 * Fixed location of the benchmark JSON file. `readBenchmarkFile` reads from
 * this path and, unless told otherwise, deletes the file afterwards.
 */
export const BENCHMARK_FILE_PATH = "/tmp/posthog-wizard-benchmark.json";

/**
 * Token counters recorded for a single step. The formatters in this file add
 * `input_tokens`, `cache_read_input_tokens` and `cache_creation_input_tokens`
 * together to produce the "Input" column.
 */
interface StepTokenUsage {
  input_tokens: number;
  output_tokens: number;
  cache_creation_input_tokens: number;
  cache_read_input_tokens: number;
}

/**
 * Usage record for one benchmarked step (one table row).
 */
export interface StepUsage {
  /** Label rendered in the "Phase" column. */
  name: string;
  /** Raw token counters for the step. */
  usage: StepTokenUsage;
  /** Per-model details keyed by string; never inspected in this file. */
  modelUsage: Record<string, unknown>;
  /** Step cost, rendered as dollars with four decimals. */
  totalCostUsd: number;
  /** Step duration in milliseconds, rendered as seconds in the "Time" column. */
  durationMs: number;
  /** Not read in this file; presumably API-only time in ms — TODO confirm against the wizard. */
  durationApiMs: number;
  /** Value rendered in the "Turns" column. */
  numTurns: number;
  /** Optional value for the "Ctx In" column. */
  contextTokensIn?: number;
  /** Optional value for the "Ctx Out" column; its presence on any step enables both context columns. */
  contextTokensOut?: number;
  /** Optional compaction count; steps with a count above zero get a note under the table. */
  compactions?: number;
  /** Optional token figures listed as "pre-tokens" in the compaction note. */
  compactionPreTokens?: number[];
}

/**
 * Run-wide aggregates. The formatters in this file print these values
 * verbatim in the TOTAL row; they are not recomputed from the steps.
 */
interface BenchmarkTotals {
  totalCostUsd: number;
  durationMs: number;
  inputTokens: number;
  outputTokens: number;
  numTurns: number;
}

/**
 * Top-level shape of the benchmark JSON file.
 */
export interface BenchmarkData {
  /** Timestamp string stored in the file; not interpreted in this file. */
  timestamp: string;
  /** One entry per step, rendered in array order. */
  steps: StepUsage[];
  /** Aggregates shown in the TOTAL row. */
  totals: BenchmarkTotals;
}

/**
 * Structural check for parsed benchmark JSON.
 * Verifies only what the formatters dereference unconditionally: a `steps`
 * array whose entries each carry a `usage` object, and a `totals` object.
 */
function isBenchmarkData(value: unknown): value is BenchmarkData {
  if (typeof value !== "object" || value === null) return false;

  const candidate = value as { steps?: unknown; totals?: unknown };
  if (!Array.isArray(candidate.steps)) return false;
  if (typeof candidate.totals !== "object" || candidate.totals === null) return false;

  return candidate.steps.every((step: unknown) => {
    if (typeof step !== "object" || step === null) return false;
    const usage = (step as { usage?: unknown }).usage;
    return typeof usage === "object" && usage !== null;
  });
}

/**
 * Read and parse the benchmark file written by the wizard.
 *
 * @param cleanup When true (default), the temp file is deleted once it has
 *   been read and parsed; a failed delete is ignored.
 * @returns The parsed data, or null if the file doesn't exist, can't be read
 *   or parsed, or doesn't have the expected `steps` / `totals` structure.
 */
export function readBenchmarkFile(cleanup = true): BenchmarkData | null {
  try {
    if (!existsSync(BENCHMARK_FILE_PATH)) {
      return null;
    }
    const raw = readFileSync(BENCHMARK_FILE_PATH, "utf-8");
    // Keep the parse result as `unknown` until its shape has been checked.
    const parsed: unknown = JSON.parse(raw);

    if (cleanup) {
      try {
        unlinkSync(BENCHMARK_FILE_PATH);
      } catch {
        // Ignore cleanup errors
      }
    }

    // Reject malformed payloads here so the formatters never see them.
    return isBenchmarkData(parsed) ? parsed : null;
  } catch {
    return null;
  }
}

// ============================================================================
// Formatting helpers
// ============================================================================

/** Render a number using en-US conventions (comma thousands separators). */
function formatNumber(n: number): string {
  const usFormatter = new Intl.NumberFormat("en-US");
  return usFormatter.format(n);
}

/** Render a USD amount with a leading "$" and exactly four decimal places. */
function formatCost(usd: number): string {
  const fourDecimals = usd.toFixed(4);
  return "$" + fourDecimals;
}

/** Render a millisecond duration as seconds with one decimal place, e.g. "42.3s". */
function formatDuration(ms: number): string {
  const seconds = ms / 1000;
  return seconds.toFixed(1) + "s";
}

/**
 * Render a context-size token count compactly.
 *
 * - `null` / `undefined` -> "-"
 * - below 10,000        -> grouped integer via `formatNumber`
 * - 10,000 to 999,499   -> whole thousands with a "K" suffix
 * - 999,500 and above   -> millions with one decimal and an "M" suffix
 *
 * @param tokens Token count, or undefined when the step did not report one.
 * @returns The display string for the table cell.
 */
function formatContext(tokens: number | undefined): string {
  if (tokens == null) return "-";
  // Switch to "M" at 999,500 rather than 1,000,000: anything from there up
  // would round to the misleading "1000K" in the thousands branch.
  if (tokens >= 999_500) return `${(tokens / 1_000_000).toFixed(1)}M`;
  if (tokens >= 10_000) return `${Math.round(tokens / 1000)}K`;
  return formatNumber(tokens);
}

/** Pad `str` with trailing spaces up to `len` characters; longer strings are returned unchanged. */
function padRight(str: string, len: number): string {
  return str.padEnd(len, " ");
}

/** Pad `str` with leading spaces up to `len` characters; longer strings are returned unchanged. */
function padLeft(str: string, len: number): string {
  return str.padStart(len, " ");
}

/**
 * Format benchmark data as a box-drawing console table.
 *
 * Produces a header row, one row per step, a separator and a TOTAL row taken
 * from `data.totals`. The "Ctx In" / "Ctx Out" columns are added only when at
 * least one step reports `contextTokensOut`. Steps with compactions are
 * listed under the table.
 *
 * @param data Parsed benchmark data.
 * @returns The table (and any compaction notes) as one newline-joined string.
 */
export function formatBenchmarkConsole(data: BenchmarkData): string {
  const hasContext = data.steps.some((s) => s.contextTokensOut != null);
  const COL = { phase: 13, input: 10, output: 10, cost: 11, turns: 7, time: 9, ctxIn: 9, ctxOut: 9 };

  const cols = [COL.phase, COL.input, COL.output, COL.cost, COL.turns, COL.time];
  const headers = [" Phase", " Input", " Output", " Cost", " Turns", " Time"];
  if (hasContext) {
    cols.push(COL.ctxIn, COL.ctxOut);
    headers.push(" Ctx In", " Ctx Out");
  }

  // Horizontal rule: one run of `fill` per column, joined by `mid`.
  const line = (left: string, mid: string, right: string, fill: string) =>
    left + cols.map((w) => fill.repeat(w)).join(mid) + right;

  const header =
    line("\u250c", "\u252c", "\u2510", "\u2500") +
    "\n" +
    "\u2502" + headers.map((h, i) => padRight(h, cols[i])).join("\u2502") + "\u2502" +
    "\n" +
    line("\u251c", "\u253c", "\u2524", "\u2500");

  const rows = data.steps.map((step) => {
    // "Input" counts fresh input plus cache reads and cache writes.
    const totalInput = step.usage.input_tokens + step.usage.cache_read_input_tokens + step.usage.cache_creation_input_tokens;
    // Numeric cells are right-aligned with one trailing space of padding.
    const cells = [
      padRight(` ${step.name}`, COL.phase),
      padLeft(formatNumber(totalInput), COL.input - 1) + " ",
      padLeft(formatNumber(step.usage.output_tokens), COL.output - 1) + " ",
      padLeft(formatCost(step.totalCostUsd), COL.cost - 1) + " ",
      padLeft(String(step.numTurns), COL.turns - 1) + " ",
      padLeft(formatDuration(step.durationMs), COL.time - 1) + " ",
    ];
    if (hasContext) {
      cells.push(
        padLeft(formatContext(step.contextTokensIn), COL.ctxIn - 1) + " ",
        padLeft(formatContext(step.contextTokensOut), COL.ctxOut - 1) + " ",
      );
    }
    return "\u2502" + cells.join("\u2502") + "\u2502";
  });

  const separator = line("\u251c", "\u253c", "\u2524", "\u2500");

  const totalCells = [
    padRight(" TOTAL", COL.phase),
    padLeft(formatNumber(data.totals.inputTokens), COL.input - 1) + " ",
    padLeft(formatNumber(data.totals.outputTokens), COL.output - 1) + " ",
    padLeft(formatCost(data.totals.totalCostUsd), COL.cost - 1) + " ",
    padLeft(String(data.totals.numTurns), COL.turns - 1) + " ",
    padLeft(formatDuration(data.totals.durationMs), COL.time - 1) + " ",
  ];
  if (hasContext) {
    // TOTAL leaves "Ctx In" blank and shows the last step's "Ctx Out".
    const lastStep = data.steps[data.steps.length - 1];
    totalCells.push(
      padLeft("", COL.ctxIn - 1) + " ",
      padLeft(formatContext(lastStep?.contextTokensOut), COL.ctxOut - 1) + " ",
    );
  }
  const totalRow = "\u2502" + totalCells.join("\u2502") + "\u2502";

  const footer = line("\u2514", "\u2534", "\u2518", "\u2500");

  const parts = [header, ...rows, separator, totalRow, footer];

  // Add compaction notes below the table if any occurred
  const compactedSteps = data.steps.filter((s) => s.compactions && s.compactions > 0);
  if (compactedSteps.length > 0) {
    parts.push("");
    const totalCompactions = compactedSteps.reduce((sum, s) => sum + (s.compactions ?? 0), 0);
    parts.push(`\u26a0 ${totalCompactions} compaction(s) detected:`);
    for (const step of compactedSteps) {
      // Only mention pre-tokens when there are values; otherwise the note
      // would end in an empty "(pre-tokens: )".
      const preTokens = step.compactionPreTokens ?? [];
      const preTokensNote =
        preTokens.length > 0
          ? ` (pre-tokens: ${preTokens.map((t) => formatContext(t)).join(", ")})`
          : "";
      parts.push(` ${step.name}: ${step.compactions}x${preTokensNote}`);
    }
  }

  return parts.join("\n");
}

/**
 * Format benchmark data as a markdown table for PR bodies.
 *
 * Output starts with a "## Benchmark" heading, then one table row per step
 * and a bold TOTAL row taken from `data.totals`. The "Ctx In" / "Ctx Out"
 * columns are added only when at least one step reports `contextTokensOut`.
 * Steps with compactions are listed in a blockquote after the table.
 *
 * @param data Parsed benchmark data.
 * @returns The markdown as one newline-joined string.
 */
export function formatBenchmarkMarkdown(data: BenchmarkData): string {
  const hasContext = data.steps.some((s) => s.contextTokensOut != null);
  const ctxHeaders = hasContext ? " Ctx In | Ctx Out |" : "";
  const ctxAlign = hasContext ? "------:|-------:|" : "";

  const lines = [
    "## Benchmark",
    "",
    `| Phase | Input | Output | Cost | Turns | Time |${ctxHeaders}`,
    `|-------|------:|-------:|-----:|------:|-----:|${ctxAlign}`,
  ];

  for (const step of data.steps) {
    // "Input" counts fresh input plus cache reads and cache writes.
    const totalInput = step.usage.input_tokens + step.usage.cache_read_input_tokens + step.usage.cache_creation_input_tokens;
    const ctxCols = hasContext
      ? ` ${formatContext(step.contextTokensIn)} | ${formatContext(step.contextTokensOut)} |`
      : "";
    lines.push(
      `| ${step.name} | ${formatNumber(totalInput)} | ${formatNumber(step.usage.output_tokens)} | ${formatCost(step.totalCostUsd)} | ${step.numTurns} | ${formatDuration(step.durationMs)} |${ctxCols}`,
    );
  }

  // TOTAL leaves "Ctx In" empty and shows the last step's "Ctx Out".
  const lastStep = data.steps[data.steps.length - 1];
  const ctxTotalCols = hasContext
    ? ` | **${formatContext(lastStep?.contextTokensOut)}** |`
    : "";
  lines.push(
    `| **TOTAL** | **${formatNumber(data.totals.inputTokens)}** | **${formatNumber(data.totals.outputTokens)}** | **${formatCost(data.totals.totalCostUsd)}** | **${data.totals.numTurns}** | **${formatDuration(data.totals.durationMs)}** |${ctxTotalCols}`,
  );

  // Add compaction notes if any occurred
  const compactedSteps = data.steps.filter((s) => s.compactions && s.compactions > 0);
  if (compactedSteps.length > 0) {
    const totalCompactions = compactedSteps.reduce((sum, s) => sum + (s.compactions ?? 0), 0);
    lines.push("");
    lines.push(`> **${totalCompactions} compaction(s)** detected during run:`);
    for (const step of compactedSteps) {
      // Only mention pre-tokens when there are values; otherwise the note
      // would end in an empty "(pre-tokens: )".
      const preTokens = step.compactionPreTokens ?? [];
      const preTokensNote =
        preTokens.length > 0
          ? ` (pre-tokens: ${preTokens.map((t) => formatContext(t)).join(", ")})`
          : "";
      lines.push(`> - **${step.name}**: ${step.compactions}x${preTokensNote}`);
    }
  }

  return lines.join("\n");
}
33 changes: 33 additions & 0 deletions services/wizard-ci/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
*/
import "dotenv/config";
import { createInterface } from "readline";
import { writeFileSync, mkdirSync } from "fs";
import { join, relative } from "path";
import { formatBenchmarkConsole, formatBenchmarkMarkdown, type BenchmarkData } from "./benchmark.js";
import {
findApps,
resetApp,
Expand Down Expand Up @@ -71,6 +73,7 @@ interface PRMetadata {
posthogRef?: string;
source?: string;
sourceUrl?: string;
benchmarkMarkdown?: string;
}

function getDependencyRefs(): Pick<PRMetadata, "wizardRef" | "contextMillRef" | "posthogRef"> {
Expand Down Expand Up @@ -125,6 +128,9 @@ function buildPRBody(meta: PRMetadata): string {
if (meta.duration !== undefined) {
lines.push(`Duration: ${formatMs(meta.duration)}`);
}
if (meta.benchmarkMarkdown) {
lines.push("", meta.benchmarkMarkdown);
}
return lines.join("\n");
}

Expand Down Expand Up @@ -523,6 +529,11 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
}
console.log(` Completed in ${formatMs(result.duration)}\n`);

// Log benchmark data if available
if (result.benchmark) {
console.log("\n" + formatBenchmarkConsole(result.benchmark) + "\n");
}

// 3. Check changes in app directory only
console.log("[3/5] Checking changes...");
if (!hasChangesInPath(repoRoot, appRelativePath)) {
Expand Down Expand Up @@ -562,6 +573,11 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
const testRunName = `local-${triggerId}-${app.name.replace(/\//g, "-")}`;
const evalInfo = await runLocalEvaluation(branchName, opts.base, testRunName);

// Save benchmark data to evaluation directory if available
if (result.benchmark && testRunName) {
saveBenchmarkToEvalDir(testRunName, result.benchmark);
}

// Return to original branch
checkout(repoRoot, originalBranch);

Expand Down Expand Up @@ -623,6 +639,7 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
shortId: triggerId,
branch: branchName,
duration: result.duration,
benchmarkMarkdown: result.benchmark ? formatBenchmarkMarkdown(result.benchmark) : undefined,
...getDependencyRefs(),
...getSourceInfo(),
};
Expand Down Expand Up @@ -674,6 +691,22 @@ async function runCI(app: App, opts: Options, triggerId: string): Promise<boolea
return true;
}

// ============================================================================
// Benchmark helpers
// ============================================================================

/**
 * Write benchmark data as pretty-printed JSON to
 * `<cwd>/test-evaluations/<testRunName>/benchmark.json`, creating the
 * directory if needed. Best-effort: any failure is logged as a warning and
 * not rethrown.
 */
function saveBenchmarkToEvalDir(testRunName: string, benchmark: BenchmarkData): void {
  try {
    const outputDir = join(process.cwd(), "test-evaluations", testRunName);
    const outputFile = join(outputDir, "benchmark.json");

    mkdirSync(outputDir, { recursive: true });

    const serialized = JSON.stringify(benchmark, null, 2);
    writeFileSync(outputFile, serialized);

    console.log(` Benchmark saved: ${outputFile}\n`);
  } catch (err) {
    console.warn(` Failed to save benchmark data: ${err}\n`);
  }
}

// ============================================================================
// Main
// ============================================================================
Expand Down
Loading