Skip to content

Commit 2f8ad5f

Browse files
authored
Merge pull request #70 from longieirl/fix/59-docker-integration-local
fix(#59): local Docker integration test with snapshot comparison
2 parents 9d24923 + db4ecaa commit 2f8ad5f

5 files changed

Lines changed: 175 additions & 8 deletions

File tree

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
## Testing
2020
- [ ] Tests pass (coverage ≥ 91%)
2121
- [ ] Manually tested
22+
- [ ] `make docker-integration` passed locally *(required when touching `Dockerfile`, `entrypoint.sh`, `docker-compose.yml`, or `packages/parser-core/`)*
2223

2324
## Checklist
2425
- [ ] Code follows project style

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,3 +249,7 @@ tmp/
249249

250250
# GSD planning artifacts
251251
.planning/
252+
253+
# Integration test snapshot — personal to each developer's local input/ PDFs.
254+
# Never commit updates; the copy on main is kept only as a reference baseline.
255+
packages/parser-core/tests/integration/snapshots/output_snapshot.json

Makefile

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ show-retention-status: ## Show data retention status
360360
@python3 -c "from src.services.data_retention import DataRetentionService; from src.app import AppConfig; config = AppConfig.from_env(); service = DataRetentionService(config.data_retention_days, config.output_dir); files = service.find_expired_files(); print(f'Retention period: {config.data_retention_days} days'); print(f'Expired files: {len(files)}')"
361361

362362
# Docker build modes
363-
.PHONY: docker-local docker-remote docker-build docker-pull
363+
.PHONY: docker-local docker-remote docker-build docker-pull docker-integration
364364

365365
docker-local: ## Build and run from local code
366366
@echo "🔨 Building from local code..."
@@ -379,6 +379,24 @@ docker-build: ## Build local image without running
379379
@cp .env.local .env
380380
docker-compose build
381381

382+
# docker-integration: build the image, run it once against ./input with the
# output mounted at /tmp/docker-integration-output, then hand that directory
# to docker_snapshot.py for comparison against the local snapshot.
# Pass UPDATE=1 to (re)write the snapshot instead of comparing.
# The output directory is wiped BEFORE the run as well as after it: make stops
# at the first failing recipe line, so a failed run never reaches the final
# cleanup and would otherwise leave stale files that pollute the next snapshot.
docker-integration: ## Run Docker integration test against input/ — compares output to local snapshot
	@echo "🧪 Running Docker integration test..."
	@[ -d input ] && [ -n "$$(find input -name '*.pdf' 2>/dev/null | head -1)" ] || { echo "❌ No PDFs found in input/ — add statements first"; exit 1; }
	@rm -rf /tmp/docker-integration-output
	@mkdir -p /tmp/docker-integration-output
	@cp .env.local .env
	@docker-compose build -q
	@docker run --rm \
		-v "$$(pwd)/input:/app/input:ro" \
		-v "/tmp/docker-integration-output:/app/output" \
		-e EXIT_AFTER_PROCESSING=true \
		bankstatementsprocessor:latest
	@python3 packages/parser-core/tests/integration/docker_snapshot.py \
		/tmp/docker-integration-output \
		packages/parser-core/tests/integration/snapshots/output_snapshot.json \
		$$([ "$(UPDATE)" = "1" ] && echo "--update" || echo "")
	@rm -rf /tmp/docker-integration-output
	@echo "✅ Docker integration test passed"
399+
382400
docker-pull: ## Pull remote image without running
383401
@cp .env.remote .env
384402
docker-compose pull
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""Docker integration snapshot helper.
2+
3+
Reads the container's output directory, builds the same snapshot structure
4+
as test_output_snapshot.py, then either updates the baseline or compares
5+
against it.
6+
7+
Usage:
8+
# Compare against existing snapshot:
9+
python3 docker_snapshot.py <output_dir> <snapshot_file>
10+
11+
# Update (or create) the snapshot baseline:
12+
python3 docker_snapshot.py <output_dir> <snapshot_file> --update
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import json
18+
import sys
19+
from pathlib import Path
20+
21+
22+
def _build_snapshot(output_dir: Path) -> dict:
23+
"""Collect comparable metrics from the output directory.
24+
25+
Mirrors the logic in test_output_snapshot._build_snapshot().
26+
"""
27+
snapshot: dict = {"files": {}}
28+
29+
for path in sorted(output_dir.iterdir()):
30+
if path.name.startswith(".") or path.is_dir():
31+
continue
32+
33+
entry: dict = {"size_bytes": path.stat().st_size}
34+
35+
if path.suffix == ".json":
36+
try:
37+
data = json.loads(path.read_text(encoding="utf-8"))
38+
if isinstance(data, list):
39+
entry["record_count"] = len(data)
40+
elif isinstance(data, dict):
41+
entry["keys"] = sorted(data.keys())
42+
except json.JSONDecodeError:
43+
pass
44+
45+
if path.suffix == ".csv":
46+
lines = path.read_text(encoding="utf-8").splitlines()
47+
entry["row_count"] = len([line for line in lines if line.strip()]) - 1
48+
49+
snapshot["files"][path.name] = entry
50+
51+
csv_files = [
52+
k for k in snapshot["files"] if k.endswith(".csv") and "duplicate" not in k
53+
]
54+
snapshot["summary"] = {
55+
"total_files": len(snapshot["files"]),
56+
"csv_outputs": len(csv_files),
57+
"output_filenames": sorted(snapshot["files"].keys()),
58+
}
59+
60+
return snapshot
61+
62+
63+
def main() -> None:
    """Compare the container output against the snapshot baseline, or update it.

    Command line: ``docker_snapshot.py <output_dir> <snapshot_file> [--update]``.

    With ``--update`` the freshly built snapshot is written to
    ``<snapshot_file>`` (parent directories are created) and the function
    returns. Otherwise the snapshot is compared with the stored baseline:
    added or removed output files and changed ``row_count`` /
    ``record_count`` values are reported as differences.

    Raises:
        SystemExit: with status 1 when arguments are missing, the output
            directory is not a directory, the baseline is missing or
            unreadable, or the current output differs from the baseline.
    """
    if len(sys.argv) < 3:
        print(
            "Usage: docker_snapshot.py <output_dir> <snapshot_file> [--update]",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = Path(sys.argv[1])
    snapshot_file = Path(sys.argv[2])
    update = "--update" in sys.argv

    # is_dir() rather than exists(): a regular file at this path would pass
    # an existence check and then crash in iterdir().
    if not output_dir.is_dir():
        print(f"❌ Output directory not found: {output_dir}", file=sys.stderr)
        sys.exit(1)

    current = _build_snapshot(output_dir)

    if update:
        snapshot_file.parent.mkdir(parents=True, exist_ok=True)
        snapshot_file.write_text(
            json.dumps(current, indent=2, sort_keys=True), encoding="utf-8"
        )
        print(f"✅ Snapshot updated: {snapshot_file}")
        return

    if not snapshot_file.exists():
        print(
            f"❌ No snapshot found at {snapshot_file}.\n"
            "Run with UPDATE=1 to create your baseline:\n"
            "  make docker-integration UPDATE=1",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        baseline = json.loads(snapshot_file.read_text(encoding="utf-8"))
    except ValueError:
        # Covers both malformed JSON and undecodable bytes.
        baseline = None
    if not isinstance(baseline, dict):
        # A corrupt baseline should give an actionable message, not a traceback.
        print(
            f"❌ Snapshot at {snapshot_file} is not a valid snapshot.\n"
            "Run with UPDATE=1 to recreate your baseline:\n"
            "  make docker-integration UPDATE=1",
            file=sys.stderr,
        )
        sys.exit(1)

    diffs = []

    # Compare the sets of output file names first.
    base_files = set(baseline.get("summary", {}).get("output_filenames", []))
    curr_files = set(current.get("summary", {}).get("output_filenames", []))
    for added in sorted(curr_files - base_files):
        diffs.append(f"  new output file: {added}")
    for removed in sorted(base_files - curr_files):
        diffs.append(f"  removed output file: {removed}")

    # Then compare per-file metrics for files present on both sides. A metric
    # is only checked when the baseline recorded it.
    base_metrics = baseline.get("files", {})
    curr_metrics = current.get("files", {})
    for fname in sorted(base_files & curr_files):
        base_entry = base_metrics.get(fname, {})
        curr_entry = curr_metrics.get(fname, {})
        for metric in ("row_count", "record_count"):
            bv = base_entry.get(metric)
            cv = curr_entry.get(metric)
            if bv is not None and bv != cv:
                # Separate old and new values; without the arrow 10 and 12
                # would be printed as the unreadable "1012".
                diffs.append(f"  {fname}.{metric}: {bv} → {cv}")

    if diffs:
        diff_text = "\n".join(diffs)
        print(
            f"❌ Snapshot mismatch — {len(diffs)} change(s) detected:\n"
            f"{diff_text}\n\n"
            "If intentional, re-run with UPDATE=1 to accept:\n"
            "  make docker-integration UPDATE=1",
            file=sys.stderr,
        )
        sys.exit(1)

    # Total records over every file that carries a count (JSON record_count
    # takes precedence over CSV row_count).
    total = sum(
        e.get("record_count", e.get("row_count", 0))
        for e in current["files"].values()
        if "record_count" in e or "row_count" in e
    )
    print(
        f"✅ Snapshot matches baseline ({total} records across {len(curr_files)} files)"
    )
137+
138+
139+
if __name__ == "__main__":
140+
main()

packages/parser-core/tests/integration/test_output_snapshot.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,21 @@
11
"""Integration snapshot test for end-to-end output validation.
22
33
Runs the full processing pipeline against the real input/ directory and
4-
compares key output metrics against a committed snapshot baseline.
4+
compares key output metrics against a local snapshot baseline.
55
6-
Usage:
7-
# Run the integration test (skipped by default):
8-
pytest -m integration
6+
The snapshot is personal to each developer's machine and input PDFs —
7+
it is gitignored and never committed. Run with --snapshot-update once
8+
to create your baseline, then re-run as you make changes to catch
9+
regressions.
910
10-
# Update the snapshot baseline (first run or after intentional change):
11+
Usage:
12+
# Create or refresh your local snapshot baseline:
1113
pytest -m integration --snapshot-update
1214
13-
The snapshot file is committed to source control so changes are visible in
14-
code review. Input/output folders are gitignored and never committed.
15+
# Validate current output against your baseline:
16+
pytest -m integration
17+
18+
Input/output folders and the snapshot file are gitignored and never committed.
1519
"""
1620

1721
from __future__ import annotations

0 commit comments

Comments
 (0)