Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions browser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Claim Browser

A small local web app for browsing retrievals and their extracted claims.

## Prerequisites

- Python 3.10+
- Install project with tokenizer extras (needed for the Llama 3.1 tokenizer):
```bash
pip install ".[llm]"
```
If you prefer a virtual environment, create/activate it first.

## Run the browser

```bash
python browser/server.py
```

What happens:
- Starts a local server at `http://localhost:5678`
- Opens your default browser automatically
- Serves the static UI from `browser/static`
- Exposes a `/decode` endpoint the UI calls to reconstruct text and token offsets

## Load data

1) Click “Load JSON file” in the top bar.
2) Select an outputs file matching the structure of `contest/outputs.json`:
- Top-level object or array of retrievals.
- Each retrieval includes:
- `retrieval` (string)
- `greedy_tokens` (list of token IDs for `meta-llama/Llama-3.1-70B-Instruct`)
- `claims` (list) where each claim has `aligned_token_ids` pointing into `greedy_tokens` and `claim_text` (or `decoded_claim`).
- Optional: `question`, `label`, `id`.

## Using the UI

- **Sidebar:** shows all retrievals; click to select.
- **Retrieval text:** tokenized and decoded via Llama tokenizer; claim tokens are highlighted.
- **Claim list:** hover a claim to isolate its spans in the retrieval; scrollable so you can hover while keeping the retrieval in view.
- If a claim references token IDs outside the available range, those tokens are ignored gracefully.

## Troubleshooting

- If the tokenizer download fails (no network), ensure the model is already cached locally or try again with network access.
- If the page doesn’t open automatically, open `http://localhost:5678` manually.
136 changes: 136 additions & 0 deletions browser/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import json
import sys
import threading
import webbrowser
from functools import partial
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from typing import Iterable, List, Tuple

# Import guard: transformers is an optional extra, so fail fast with an
# actionable install hint instead of a bare ImportError traceback.
try:
    from transformers import AutoTokenizer
except ImportError as exc:  # pragma: no cover - import guard
    # Keep a clear message so users know how to install extras.
    raise SystemExit(
        "Transformers is required to run the claim browser. "
        "Install the optional LLM extras with `pip install .[llm]`."
    ) from exc


# Port the local server binds to; overridable via a single CLI argument.
DEFAULT_PORT: int = 5678
# Directory containing the static UI (index.html, styles.css, main.js).
STATIC_DIR: Path = Path(__file__).parent / "static"


def _validate_token_ids(raw_ids: Iterable) -> List[int]:
token_ids = list(raw_ids)
if not all(isinstance(t, int) for t in token_ids):
raise ValueError("token_ids must be a list of integers")
return token_ids


def decode_tokens(tokenizer, token_ids: List[int]) -> Tuple[str, List[dict]]:
    """Decode *token_ids* into text plus per-token character offsets.

    Each token's fragment is recovered by decoding the running prefix of
    tokens and keeping only the newly appended suffix, so the tokenizer's
    own spacing decisions are preserved exactly.

    Returns:
        A ``(text, offsets)`` pair where ``offsets`` holds one dict per
        token with ``index``, ``start``, ``end``, ``text``, ``token_id``
        and ``token`` keys; ``start``/``end`` index into ``text``.
    """
    token_strings = tokenizer.convert_ids_to_tokens(token_ids)

    pieces: List[str] = []
    offsets: List[dict] = []
    decoded_so_far = ""
    position = 0

    for i in range(len(token_strings)):
        # Decode the prefix up to and including token i; the new suffix is
        # exactly the text contributed by token i.
        decoded = tokenizer.convert_tokens_to_string(token_strings[: i + 1])
        fragment = decoded[len(decoded_so_far) :]
        pieces.append(fragment)
        offsets.append(
            {
                "index": i,
                "start": position,
                "end": position + len(fragment),
                "text": fragment,
                "token_id": token_ids[i],
                "token": token_strings[i],
            }
        )
        position += len(fragment)
        decoded_so_far = decoded

    return "".join(pieces), offsets


class ClaimBrowserHandler(SimpleHTTPRequestHandler):
    """Serves the static UI (via SimpleHTTPRequestHandler) plus one JSON
    POST endpoint, ``/decode``, that turns token IDs into text + offsets."""

    # Shared tokenizer; populated once in main() before the server starts.
    tokenizer = None

    def do_POST(self) -> None:  # pragma: no cover - runtime path
        # Only /decode is a valid POST target; everything else is a 404.
        if self.path != "/decode":
            self.send_error(404, "Unknown endpoint")
            return

        # Parse and validate the request body.
        try:
            length = int(self.headers.get("Content-Length", "0"))
            request = json.loads(self.rfile.read(length))
            ids = _validate_token_ids(request.get("token_ids", []))
        except Exception as exc:  # broad to simplify response
            self.send_error(400, f"Invalid request: {exc}")
            return

        # Decode; tokenizer failures are server-side errors, not client ones.
        try:
            text, offsets = decode_tokens(self.tokenizer, ids)
        except Exception as exc:  # pragma: no cover - runtime path
            self.send_error(500, f"Failed to decode tokens: {exc}")
            return

        response = json.dumps({"text": text, "tokens": offsets}).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(response)))
        self.end_headers()
        self.wfile.write(response)


def _load_tokenizer():
    """Load (downloading on first use) the Llama 3.1 tokenizer the UI
    relies on to reconstruct retrieval text from token IDs."""
    model_name = "meta-llama/Llama-3.1-70B-Instruct"
    print(f"Loading tokenizer {model_name} ...")
    return AutoTokenizer.from_pretrained(model_name)


def main(port: int = DEFAULT_PORT) -> None:  # pragma: no cover - runtime entrypoint
    """Start the claim-browser server and open the UI in a browser.

    Loads the tokenizer up front, serves ``STATIC_DIR`` at the root, and
    blocks in ``serve_forever`` until interrupted with Ctrl-C.
    """
    if not STATIC_DIR.exists():
        raise SystemExit(f"Static assets not found in {STATIC_DIR}")

    # Attach the tokenizer to the handler class so every request shares it.
    ClaimBrowserHandler.tokenizer = _load_tokenizer()

    factory = partial(ClaimBrowserHandler, directory=str(STATIC_DIR))
    url = f"http://localhost:{port}"

    # The with-block guarantees server_close() on every exit path.
    with ThreadingHTTPServer(("127.0.0.1", port), factory) as httpd:
        print(f"Serving claim browser at {url}")
        # Open the browser from a daemon thread so startup never blocks on it.
        threading.Thread(target=lambda: webbrowser.open(url), daemon=True).start()
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down...")


if __name__ == "__main__":  # pragma: no cover - CLI execution
    # A single optional CLI argument overrides the default port.
    if len(sys.argv) == 2:
        try:
            chosen_port = int(sys.argv[1])
        except ValueError:
            raise SystemExit("Port must be an integer")
    else:
        chosen_port = DEFAULT_PORT
    main(port=chosen_port)
50 changes: 50 additions & 0 deletions browser/static/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Claim Browser</title>
<link rel="stylesheet" href="/styles.css" />
</head>
<body>
<!-- Top bar: app brand, the JSON file picker, and a status message area
     (#status) that main.js updates while loading/decoding. -->
<header class="topbar">
<div class="brand">
<span class="dot"></span>
<span>Claim Browser</span>
</div>
<label class="file-control">
<input type="file" id="file-input" accept=".json,application/json" />
<span class="file-label">Load JSON file</span>
</label>
<div id="status" class="status"></div>
</header>

<!-- Two-pane layout: retrieval list sidebar + content pane. main.js fills
     #retrieval-list, #retrieval-text, and #claims-list after a file loads. -->
<main class="layout">
<aside class="sidebar">
<div class="sidebar-header">Retrievals</div>
<ul id="retrieval-list" class="retrieval-list"></ul>
</aside>
<section class="content">
<div class="content-header">
<div>
<div class="label">Selected retrieval</div>
<div id="retrieval-title" class="title">No file loaded</div>
</div>
<div id="retrieval-question" class="question"></div>
</div>
<!-- Placeholder shown until a file is loaded; replaced by decoded text. -->
<div id="retrieval-text" class="retrieval-text empty-state">
<div class="placeholder">
<p>Load <code>outputs.json</code> (or similar) to browse claims.</p>
<p>The browser will decode tokens with the Llama 3.1 tokenizer and highlight claim spans.</p>
</div>
</div>
<div class="claims-section">
<div class="claims-header">Claims</div>
<ul id="claims-list" class="claims-list"></ul>
</div>
</section>
</main>

<script src="/main.js" defer></script>
</body>
</html>
Loading