# Benchmark workflow, as captured from GitHub Actions run "Benchmark #15".
# Everything below is the workflow file for that run.

name: Benchmark

# Events that start this workflow.
on:
  # Pull requests against main, but only when the benchmark package or this
  # workflow file is touched.
  pull_request:
    branches:
      - main
    paths:
      - 'packages/benchmark/**'
      - '.github/workflows/benchmark.yml'

  # Scheduled run: weekly, on Sunday at midnight.
  schedule:
    - cron: '0 0 * * 0'

  # Manual run with tunable inputs.
  workflow_dispatch:
    inputs:
      dataset_variant:
        description: 'Dataset variant (s=small, m=medium, oracle)'
        type: choice
        options:
          - oracle
          - s
          - m
        default: 'oracle'
        required: false
      limit:
        # Kept as a string input; "0" disables the limit.
        description: 'Limit number of instances (0 = no limit)'
        type: string
        default: '10'
        required: false
      compare_baseline:
        description: 'Compare against baseline'
        type: boolean
        default: true
        required: false
jobs:
  # Runs the LongMemEval benchmark with stub providers, publishes the raw
  # results as an artifact and, on pull requests, posts a baseline comparison
  # as a PR comment.
  benchmark:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    env:
      # Dispatch inputs do not exist on schedule/pull_request events, so fall
      # back to the defaults the inputs declare. The values are handed to the
      # scripts as environment variables instead of being interpolated into
      # them, so free-form text cannot inject shell or JavaScript.
      VARIANT: ${{ inputs.dataset_variant || 'oracle' }}
      LIMIT: ${{ inputs.limit || '10' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: "1.3.5"

      - name: Cache node_modules
        id: node-modules
        uses: actions/cache@v5
        with:
          path: node_modules
          key: bun-modules-${{ hashFiles('bun.lock') }}

      # Skipped when the cache restored node_modules for this exact lockfile.
      - name: Install dependencies
        if: steps.node-modules.outputs.cache-hit != 'true'
        run: bun install --frozen-lockfile

      - name: Build packages
        run: bun run build

      - name: Download LongMemEval dataset
        run: |
          set -euo pipefail
          BASE_URL="https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main"
          mkdir -p packages/benchmark/data
          cd packages/benchmark/data

          # fetch <label> <file>: download one dataset file unless it already
          # exists. --fail makes curl exit non-zero on an HTTP error instead
          # of saving the error page as if it were the dataset.
          fetch() {
            if [ ! -f "$2" ]; then
              echo "Downloading LongMemEval $1 dataset..."
              curl --fail --location --retry 3 --silent --show-error \
                --output "$2" "$BASE_URL/$2"
            fi
          }

          # Download oracle dataset (smallest, for quick validation)
          fetch oracle longmemeval_oracle.json
          DATASET="longmemeval_oracle.json"

          # Download the requested variant on top of the oracle file.
          case "$VARIANT" in
            oracle)
              ;;
            s)
              DATASET="longmemeval_s_cleaned.json"
              fetch small "$DATASET"
              ;;
            m)
              # NOTE(review): assumes the medium file follows the same naming
              # scheme as the small one - confirm against the dataset repo.
              DATASET="longmemeval_m_cleaned.json"
              fetch medium "$DATASET"
              ;;
            *)
              echo "::error::Unknown dataset variant '$VARIANT'"
              exit 1
              ;;
          esac

          # Later steps read the resolved path (relative to packages/benchmark)
          # from the environment, so the variant-to-file mapping lives in one
          # place only.
          echo "DATASET_FILE=data/$DATASET" >> "$GITHUB_ENV"

      - name: Validate dataset
        working-directory: packages/benchmark
        run: bunx tsx src/cli/index.ts validate "$DATASET_FILE"

      - name: Run benchmark (stub providers)
        id: benchmark
        working-directory: packages/benchmark
        shell: bash
        run: |
          mkdir -p results

          # The limit is free-form text on manual runs; reject anything that
          # is not a non-negative integer before it reaches the CLI.
          case "$LIMIT" in
            '' | *[!0-9]*)
              echo "::error::limit must be a non-negative integer, got '$LIMIT'"
              exit 1
              ;;
          esac

          # A limit of 0 means "no limit": pass no --limit flag at all.
          LIMIT_ARGS=()
          if [ "$LIMIT" != "0" ]; then
            LIMIT_ARGS=(--limit "$LIMIT")
          fi

          # Run benchmark with stub providers (for CI testing). The exit
          # status of the benchmark itself (not of tee) is captured so the
          # summary can still be written before the step reports a failure.
          set +e
          bunx tsx src/cli/index.ts run longmemeval \
            --dataset "$DATASET_FILE" \
            --variant "$VARIANT" \
            --output results/benchmark-results.jsonl \
            --embeddings stub \
            --llm stub \
            --chain-of-note \
            --key-expansion \
            --temporal-analysis \
            --verbose \
            "${LIMIT_ARGS[@]}" 2>&1 | tee results/benchmark-output.txt
          STATUS=${PIPESTATUS[0]}
          set -e

          # Publish the tail of the output in the run summary.
          {
            echo "## Benchmark Results"
            echo ""
            echo "**Dataset:** $VARIANT"
            echo "**Instances:** $LIMIT"
            echo ""
            echo '```'
            tail -50 results/benchmark-output.txt
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

          exit "$STATUS"

      - name: Save metrics JSON
        working-directory: packages/benchmark
        env:
          COMMIT_SHA: ${{ github.sha }}
          BRANCH: ${{ github.ref_name }}
          RUN_ID: ${{ github.run_id }}
        run: |
          # Create a structured metrics file. The script is single-quoted so
          # the shell expands nothing inside it; every run-specific value is
          # read from process.env.
          bun -e '
            const fs = require("fs");
            const output = fs.readFileSync("results/benchmark-output.txt", "utf8");

            // Parse accuracy from output. A missing line is still recorded
            // as 0, but it is flagged so the 0 is not mistaken for a result.
            const accMatch = output.match(/Accuracy:\s*([\d.]+)%/);
            if (!accMatch) {
              console.log("::warning::No accuracy line found in benchmark output; recording 0");
            }
            const accuracy = accMatch ? parseFloat(accMatch[1]) : 0;

            const metrics = {
              timestamp: new Date().toISOString(),
              variant: process.env.VARIANT,
              // A limit of 0 (no limit) is stored as null.
              limit: parseInt(process.env.LIMIT, 10) || null,
              accuracy: accuracy,
              commit: process.env.COMMIT_SHA,
              branch: process.env.BRANCH,
              workflow_run: process.env.RUN_ID
            };

            fs.writeFileSync("results/metrics.json", JSON.stringify(metrics, null, 2));
            console.log("Metrics:", JSON.stringify(metrics, null, 2));
          '

      - name: Upload benchmark results
        uses: actions/upload-artifact@v6
        with:
          name: benchmark-results-${{ github.run_id }}
          path: |
            packages/benchmark/results/benchmark-results.jsonl
            packages/benchmark/results/benchmark-output.txt
            packages/benchmark/results/metrics.json
          retention-days: 90

      - name: Compare with baseline (PR only)
        # The compare_baseline input is deliberately not consulted here:
        # inputs only exist on workflow_dispatch, and on a pull_request event
        # the empty value is loosely equal to `false`, which would disable the
        # step for every PR. Fork PRs are skipped because their token is
        # read-only and cannot post comments.
        if: >-
          github.event_name == 'pull_request' &&
          github.event.pull_request.head.repo.full_name == github.repository
        uses: actions/github-script@v8
        with:
          script: |
            const fs = require('fs');

            // Read current metrics
            const metricsPath = 'packages/benchmark/results/metrics.json';
            const metrics = JSON.parse(fs.readFileSync(metricsPath, 'utf8'));

            // Baseline (stub providers typically get 60-70% with string matching)
            const baseline = {
              accuracy: 60.0,
              threshold: 5.0 // Alert if drop > 5%
            };

            const diff = metrics.accuracy - baseline.accuracy;
            const regressed = diff < -baseline.threshold;
            const status = regressed ? '⚠️' : '✅';

            // Built line by line so the Markdown does not depend on how this
            // script happens to be indented inside the workflow file.
            const body = [
              `## Benchmark Results ${status}`,
              '',
              '| Metric | Value | Baseline | Diff |',
              '|:-------|------:|---------:|-----:|',
              `| Accuracy | ${metrics.accuracy.toFixed(1)}% | ${baseline.accuracy.toFixed(1)}% | ${diff >= 0 ? '+' : ''}${diff.toFixed(1)}% |`,
              '',
              `**Dataset:** ${metrics.variant}`,
              `**Commit:** \`${metrics.commit.substring(0, 7)}\``,
              '',
              regressed ? '⚠️ **Warning:** Accuracy dropped below threshold!' : '',
              '',
              '<details>',
              '<summary>Details</summary>',
              '',
              `- Workflow run: ${metrics.workflow_run}`,
              `- Timestamp: ${metrics.timestamp}`,
              '',
              '</details>',
              ''
            ].join('\n');

            // Post or update comment. All pages are fetched so an existing
            // bot comment is found even on long PR threads.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              per_page: 100
            });

            const botComment = comments.find(c =>
              c.user?.type === 'Bot' && c.body?.includes('## Benchmark Results')
            );

            if (botComment) {
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: botComment.id,
                body
              });
            } else {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body
              });
            }
# Historical tracking job (only on main branch). Nested under `jobs:`; it
# appends the metrics of this run to a history file, regenerates the badge
# data and commits both back to the repository.
  track-history:
    needs: benchmark
    if: github.ref == 'refs/heads/main' && github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Download current results
        uses: actions/download-artifact@v6
        with:
          name: benchmark-results-${{ github.run_id }}
          path: current-results

      - name: Update historical data
        run: |
          HISTORY_DIR=".github/benchmark-history"
          HISTORY_FILE="$HISTORY_DIR/history.jsonl"
          mkdir -p "$HISTORY_DIR"

          # Append to history file. metrics.json is pretty-printed across
          # several lines, so it is compacted to a single line first; the
          # history is JSON Lines and is trimmed by line count below.
          if [ -f "current-results/metrics.json" ]; then
            jq -c . current-results/metrics.json >> "$HISTORY_FILE"
          fi

          # Keep only last 100 entries
          if [ -f "$HISTORY_FILE" ]; then
            if jq -c . "$HISTORY_FILE" > "$HISTORY_FILE.tmp" 2>/dev/null; then
              # Re-compacting the whole file also repairs any multi-line
              # entries, so one line is always exactly one entry.
              tail -n 100 "$HISTORY_FILE.tmp" > "$HISTORY_FILE"
              rm -f "$HISTORY_FILE.tmp"
            else
              # The file holds something jq cannot parse; fall back to a
              # plain line trim rather than discarding the history.
              tail -n 100 "$HISTORY_FILE" > "$HISTORY_FILE.tmp"
              mv "$HISTORY_FILE.tmp" "$HISTORY_FILE"
            fi
          fi

      - name: Generate badge data
        run: |
          mkdir -p .github/benchmark-history

          # Read latest accuracy. This job does not install Bun, so jq, awk
          # and printf are used instead; a missing or non-numeric accuracy
          # makes printf fail and stops the step.
          RAW_ACCURACY=$(jq -r '.accuracy' current-results/metrics.json)
          ACCURACY=$(printf '%.1f' "$RAW_ACCURACY")

          # Pick the badge colour from the rounded accuracy.
          COLOR=$(awk -v acc="$ACCURACY" 'BEGIN {
            if (acc >= 80) {
              print "brightgreen"
            } else if (acc >= 70) {
              print "green"
            } else if (acc >= 60) {
              print "yellow"
            } else if (acc >= 50) {
              print "orange"
            } else {
              print "red"
            }
          }')

          # Generate badge JSON for shields.io
          jq -n --arg message "${ACCURACY}%" --arg color "$COLOR" \
            '{schemaVersion: 1, label: "LongMemEval", message: $message, color: $color}' \
            > .github/benchmark-history/badge.json

      - name: Commit history update
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/benchmark-history/

          # Push only when something was committed, and let a rejected push
          # fail the step instead of reporting it as "nothing to push".
          if git diff --staged --quiet; then
            echo "No benchmark history changes to commit"
          else
            git commit -m "chore: update benchmark history [skip ci]"
            git push
          fi