emnigma · mikhail-kharlamov · Nov 9, 2025 · Nov 9, 2025 · Nov 9, 2025 · Nov 9, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -2,11 +2,13 @@
 .mypy_cache/
 .pytest_cache/
 __pycache__/
+*.DS_Store
 .ipynb_checkpoints/
 .hypothesis/
 *.egg-info/
 build/
 .venv/
 uv.lock
 .env
-src/benchmarking/agent_chat/logs
+src/benchmark/simple_benchmarking/agent_chat/logs
+src/benchmark/tool_plan_benchmarking/logs
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,65 @@
+# Contributor guide (AGENTS)
+
+## Project structure
+
+- `src/` – main package code.
+  - `src/algorithms/` – dialogue systems: `summarize_algorithms/` (e.g. `memory_bank/`, `recsum/`, shared `core/`) and `simple_algorithms/` baselines.
+  - `src/benchmark/` – evaluation scripts, metrics, log parsing, and plotting.
+  - `src/utils/` – small shared helpers (logging/config parsing).
+  - Entry point: `src/main.py` (also exposed as a script in `pyproject.toml`: `recapkt = "src.main:main"`).
+- `tests/` – pytest suite (files follow `test_*.py`).
+- `requirements.txt`, `requirements.dev.txt` – runtime/dev dependencies.
+
+## Build, test, and development commands
+
+This repo targets **Python >= 3.12** (see `pyproject.toml`). CI uses **uv**.
+
+- Create env + install deps (recommended):
+  ```bash
+  uv venv
+  uv pip install -r requirements.txt -r requirements.dev.txt
+  ```
+- Run the example entry point:
+  ```bash
+  python -m src.main
+  # or
+  uv run recapkt
+  ```
+- Lint / format (Ruff):
+  ```bash
+  ruff check .
+  ruff format .
+  ```
+- Type-check (Mypy):
+  ```bash
+  uv run mypy
+  ```
+- Run tests:
+  ```bash
+  uv run python -m pytest
+  ```
+- Tool-metrics benchmarking helper:
+  ```bash
+  ./run.sh <arg>   # runs src/benchmark/tool_plan_benchmarking/run.py
+  ```
+
+## Code style and naming
+
+- Formatting/linting: Ruff is the source of truth (line length **120**, double quotes).
+- Typing: keep functions typed; the project configuration disallows untyped defs in `src/`.
+- Naming:
+  - modules/files: `snake_case.py`
+  - classes: `CamelCase`
+  - tests: `tests/test_<unit>.py`, test functions `test_<behavior>()`
+
+## VCS: commits and pull requests
+
+- Commit messages follow a lightweight Conventional Commits style seen in history: `feat: ...`, `fix: ...`.
+- PRs should:
+  - describe the change + rationale,
+  - include how to reproduce/verify (commands or a minimal snippet),
+  - keep CI green (GitHub Actions runs `ruff check`, `mypy`, `pytest` on PRs).
+
+## Secrets and local config
+
+- Don’t commit `.env`. If your change needs new settings, document them and keep defaults safe.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,31 @@
+.PHONY: help run-main run-tool-plan run-tool-metrics run-tool-metrics-sh test
+
+# Prefer local venv if present (absolute path, so it still resolves after a recipe `cd`), else system python.
+PYTHON := $(shell [ -x .venv/bin/python ] && echo $(CURDIR)/.venv/bin/python || (command -v python3 >/dev/null 2>&1 && echo python3 || echo python))
+
+# Optional arg forwarded to run.py by run-tool-metrics and run-tool-metrics-sh:
+#   make run-tool-metrics ARG=base_recsum
+ARG ?= base_recsum
+
+help:
+	@echo "Available targets:"
+	@echo "  make run-main"
+	@echo "  make run-tool-plan"
+	@echo "  make run-tool-metrics ARG=<arg>"
+	@echo "  make run-tool-metrics-sh ARG=<arg>"
+	@echo "  make test"
+
+run-main:
+	$(PYTHON) -m src.main
+
+run-tool-plan:  # run.py is a script path, so no `-m`; PYTHONPATH mirrors what run.sh exports
+	cd src/benchmark/tool_plan_benchmarking && PYTHONPATH="$(CURDIR)" $(PYTHON) run.py
+
+run-tool-metrics:  # same script as run-tool-plan, with ARG passed as its first argument
+	cd src/benchmark/tool_plan_benchmarking && PYTHONPATH="$(CURDIR)" $(PYTHON) run.py $(ARG)
+
+run-tool-metrics-sh:
+	./run.sh $(ARG)
+
+test:
+	$(PYTHON) -m pytest -q
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,8 +8,32 @@ authors = [
     { name = "Mikhail Kharlamov" }
 ]
 readme = "README.md"
-requires-python = ">=3.8"
-dependencies = []
+requires-python = ">=3.12"
+dependencies = [
+    "colorlog~=6.10.1",
+    "dataclasses-json~=0.6.7",
+    "datasets==4.0.0",
+    "faiss-cpu==1.11.0",
+    "jinja2==3.1.6",
+    "langchain>=1.1.0",
+    "langchain-community>=0.4.1",
+    "langchain-core>=1.1.0",
+    "langchain-ollama>=1.0.1",
+    "langchain-openai>=1.0.0",
+    "langgraph>=1.0.0",
+    "load-dotenv>=0.1.0",
+    "matplotlib~=3.10.7",
+    "numpy>=1.26.2",
+    "openai~=1.109.1",
+    "pandas~=2.3.3",
+    "pydantic~=2.11.9",
+    "pytest>=9.0.2",
+    "python-dotenv~=1.1.1",
+    "scikit-learn==1.5.2",
+    "seaborn~=0.13.2",
+    "tiktoken==0.9.0",
+    "transformers>=4.57.6",
+]
 
 [project.scripts]
 recapkt = "src.main:main"
@@ -48,5 +72,9 @@ warn_no_return = "False"
 no_implicit_optional = "False"
 
 [tool.pytest.ini_options]
+pythonpath = ["src"]
 testpaths = ["tests"]
-addopts = ["--color=yes", "-s"]
+addopts = ["--color=yes", "-s"]
+
+[dependency-groups]
+dev = []
diff --git a/requirements.txt b/requirements.txt
@@ -1,14 +1,26 @@
-langchain-core>=0.3.72,<1.0.0
-langchain-openai==0.3.28
-langgraph==0.5.3
-langchain>=0.3.27,<0.4.0
+langchain-openai>=1.0.0
+langchain-core>=1.1.0,<2.0.0
+langchain-openai>=0.3.28
+langchain-ollama>=1.0.1
+langgraph>=1.0.0
+langchain>=1.1.0,<2.0.0
+langchain-community>=0.4.1
+
 tiktoken==0.9.0
+transformers
 datasets==4.0.0
 numpy>=1.26.2
 scikit-learn==1.5.2
 faiss-cpu==1.11.0
-langchain-community~=0.3.31
 pydantic~=2.11.9
-pytest~=8.3.4
 dataclasses-json~=0.6.7
-openai~=1.109.1
+openai~=1.109.1
+jinja2==3.1.6
+
+python-dotenv~=1.1.1
+
+colorlog~=6.10.1
+seaborn~=0.13.2
+pandas~=2.3.3
+matplotlib~=3.10.7
+load_dotenv
diff --git a/run.sh b/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+source "$ROOT_DIR/.venv/bin/activate"
+export PYTHONPATH="$ROOT_DIR"
+
+cd "$ROOT_DIR/src/benchmark/tool_plan_benchmarking"
+python run.py "${1:-}"
diff --git a/src/.DS_Store b/src/.DS_Store
diff --git a/src/benchmarking/__init__.py → src/algorithms/__init__.py b/src/benchmarking/__init__.py → src/algorithms/__init__.py
diff --git a/src/algorithms/dialogue.py b/src/algorithms/dialogue.py
@@ -0,0 +1,22 @@
+from typing import Any, Protocol
+
+from src.algorithms.summarize_algorithms.core.models import DialogueState, Session
+
+
+class Dialogue(Protocol):
+    """
+    Minimal public interface for a dialogue system used throughout benchmark.
+
+    Any implementation must expose a `system_name` and provide `process_dialogue()` returning a `DialogueState`.
+    """
+
+    system_name: str
+
+    def process_dialogue(
+            self,
+            sessions: list[Session],
+            system_prompt: str,
+            structure: dict[str, Any] | None = None,
+            tools: list[dict[str, Any]] | None = None
+    ) -> DialogueState:
+        ...
diff --git a/src/benchmarking/agent_chat/__init__.py → src/algorithms/simple_algorithms/__init__.py b/src/benchmarking/agent_chat/__init__.py → src/algorithms/simple_algorithms/__init__.py
diff --git a/src/algorithms/simple_algorithms/dialog_short_tools.py b/src/algorithms/simple_algorithms/dialog_short_tools.py
@@ -0,0 +1,31 @@
+from typing import override
+
+from langchain_core.messages import BaseMessage, ToolMessage
+
+from src.algorithms.simple_algorithms.dialogue_baseline import DialogueBaseline
+from src.algorithms.summarize_algorithms.core.models import Session
+
+
+class DialogueWithShortTools(DialogueBaseline):
+    """
+    Baseline variant that shortens tool messages.
+
+    Keeps tool call structure in the history but clears `ToolMessage.content` to reduce context length.
+    """
+
+    @override
+    @staticmethod
+    def _compress(sessions: list[Session]) -> list[BaseMessage]:
+        """
+        Compress sessions by clearing tool message contents.
+
+        :param sessions: past sessions.
+        :return: list[BaseMessage]: flattened history with shortened tool messages.
+        """
+        messages: list[BaseMessage] = DialogueBaseline._get_context(sessions)
+
+        for message in messages:
+            if isinstance(message, ToolMessage):
+                message.content = ""
+
+        return messages
diff --git a/src/algorithms/simple_algorithms/dialog_with_weights.py b/src/algorithms/simple_algorithms/dialog_with_weights.py
@@ -0,0 +1,49 @@
+from decimal import Decimal
+from math import ceil
+from typing import override
+
+from langchain_core.messages import BaseMessage, HumanMessage
+
+from src.algorithms.simple_algorithms.dialogue_baseline import DialogueBaseline
+from src.algorithms.summarize_algorithms.core.models import Session
+
+
+class DialogueWithWeights(DialogueBaseline):
+    """
+    Baseline variant that compresses history by truncating message contents with a positional weight.
+
+    Messages closer to the center of the conversation get truncated more aggressively (triangle-shaped coefficient).
+    Human messages are preserved.
+    """
+
+    @override
+    @staticmethod
+    def _compress(sessions: list[Session]) -> list[BaseMessage]:
+        """
+        Compress sessions by truncating non-human messages based on their position.
+
+        :param sessions: past sessions.
+        :return: list[BaseMessage]: flattened history with weighted truncation applied.
+        """
+        messages: list[BaseMessage] = DialogueBaseline._get_context(sessions)
+        cropped_messages: list[BaseMessage] = []
+
+        mid: int = (len(messages) - 1) // 2
+        step: Decimal = Decimal(1) / Decimal(mid)
+        coefficient: Decimal = Decimal(1)
+
+        for i in range(len(messages)):
+            if coefficient > 0 and i != 0:
+                coefficient -= step
+            else:
+                coefficient += step
+
+            message = messages[i]
+            if isinstance(message, HumanMessage):
+                cropped_messages.append(message)
+                continue
+
+            message.content = message.content[:ceil(len(message.content) * coefficient)]
+            cropped_messages.append(message)
+
+        return cropped_messages