Load outputs.json (or similar) to browse claims.
The browser will decode tokens with the Llama 3.1 tokenizer and highlight claim spans.
+
diff --git a/browser/README.md b/browser/README.md
new file mode 100644
index 0000000..a27df30
--- /dev/null
+++ b/browser/README.md
@@ -0,0 +1,47 @@
+# Claim Browser
+
+A small local web app for browsing retrievals and their extracted claims.
+
+## Prerequisites
+
+- Python 3.10+
+- Install project with tokenizer extras (needed for the Llama 3.1 tokenizer):
+  ```bash
+  pip install .[llm]
+  ```
+  If you prefer a virtual environment, create/activate it first.
+
+## Run the browser
+
+```bash
+python browser/server.py
+```
+
+What happens:
+- Starts a local server at `http://localhost:5678`
+- Opens your default browser automatically
+- Serves the static UI from `browser/static`
+- Exposes a `/decode` endpoint the UI calls to reconstruct text and token offsets
+
+## Load data
+
+1) Click “Load JSON file” in the top bar.
+2) Select an outputs file matching the structure of `contest/outputs.json`:
+   - Top-level object or array of retrievals.
+   - Each retrieval includes:
+     - `retrieval` (string)
+     - `greedy_tokens` (list of token IDs for `meta-llama/Llama-3.1-70B-Instruct`)
+     - `claims` (list) where each claim has `aligned_token_ids` pointing into `greedy_tokens` and `claim_text` (or `decoded_claim`).
+   - Optional: `question`, `label`, `id`.
+
+## Using the UI
+
+- **Sidebar:** shows all retrievals; click to select.
+- **Retrieval text:** tokenized and decoded via Llama tokenizer; claim tokens are highlighted.
+- **Claim list:** hover a claim to isolate its spans in the retrieval; scrollable so you can hover while keeping the retrieval in view.
+- If a claim references token IDs outside the available range, those tokens are ignored gracefully.
+
+## Troubleshooting
+
+- If the tokenizer download fails (no network), ensure the model is already cached locally or try again with network access.
+- If the page doesn’t open automatically, open `http://localhost:5678` manually.
diff --git a/browser/server.py b/browser/server.py
new file mode 100644
index 0000000..c1296c4
--- /dev/null
+++ b/browser/server.py
@@ -0,0 +1,136 @@
+import json
+import sys
+import threading
+import webbrowser
+from functools import partial
+from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+from typing import Iterable, List, Tuple
+
+try:
+    from transformers import AutoTokenizer
+except ImportError as exc:  # pragma: no cover - import guard
+    # Keep a clear message so users know how to install extras.
+    raise SystemExit(
+        "Transformers is required to run the claim browser. "
+        "Install the optional LLM extras with `pip install .[llm]`."
+    ) from exc
+
+
+DEFAULT_PORT = 5678
+STATIC_DIR = Path(__file__).parent / "static"
+
+
+def _validate_token_ids(raw_ids: Iterable) -> List[int]:
+    token_ids = list(raw_ids)
+    if not all(isinstance(t, int) for t in token_ids):
+        raise ValueError("token_ids must be a list of integers")
+    return token_ids
+
+
+def decode_tokens(tokenizer, token_ids: List[int]) -> Tuple[str, List[dict]]:
+    """
+    Decode tokens and return the reconstructed text plus per-token offsets.
+
+    Offsets are built by progressively decoding the prefix of tokens. This
+    preserves spacing decisions made by the tokenizer.
+    """
+    tokens = tokenizer.convert_ids_to_tokens(token_ids)
+    fragments: List[str] = []
+    offsets: List[dict] = []
+
+    prev_text = ""
+    for idx in range(len(tokens)):
+        partial_text = tokenizer.convert_tokens_to_string(tokens[: idx + 1])
+        piece = partial_text[len(prev_text) :]
+        fragments.append(piece)
+        prev_text = partial_text
+
+    cursor = 0
+    for idx, piece in enumerate(fragments):
+        start = cursor
+        cursor += len(piece)
+        offsets.append(
+            {
+                "index": idx,
+                "start": start,
+                "end": cursor,
+                "text": piece,
+                "token_id": token_ids[idx],
+                "token": tokens[idx],
+            }
+        )
+
+    full_text = "".join(fragments)
+    return full_text, offsets
+
+
+class ClaimBrowserHandler(SimpleHTTPRequestHandler):
+    tokenizer = None
+
+    def do_POST(self) -> None:  # pragma: no cover - runtime path
+        if self.path != "/decode":
+            self.send_error(404, "Unknown endpoint")
+            return
+
+        try:
+            content_length = int(self.headers.get("Content-Length", "0"))
+            payload = self.rfile.read(content_length)
+            data = json.loads(payload)
+            token_ids = _validate_token_ids(data.get("token_ids", []))
+        except Exception as exc:  # broad to simplify response
+            self.send_error(400, f"Invalid request: {exc}")
+            return
+
+        try:
+            text, offsets = decode_tokens(self.tokenizer, token_ids)
+        except Exception as exc:  # pragma: no cover - runtime path
+            self.send_error(500, f"Failed to decode tokens: {exc}")
+            return
+
+        body = json.dumps({"text": text, "tokens": offsets}).encode("utf-8")
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+
+def _load_tokenizer():
+    print("Loading tokenizer meta-llama/Llama-3.1-70B-Instruct ...")
+    tokenizer = AutoTokenizer.from_pretrained(
+        "meta-llama/Llama-3.1-70B-Instruct"
+    )
+    return tokenizer
+
+
+def main(port: int = DEFAULT_PORT) -> None:  # pragma: no cover - runtime entrypoint
+    if not STATIC_DIR.exists():
+        raise SystemExit(f"Static assets not found in {STATIC_DIR}")
+
+    tokenizer = _load_tokenizer()
+    ClaimBrowserHandler.tokenizer = tokenizer
+
+    handler_class = partial(ClaimBrowserHandler, directory=str(STATIC_DIR))
+    server = ThreadingHTTPServer(("127.0.0.1", port), handler_class)
+
+    url = f"http://localhost:{port}"
+    print(f"Serving claim browser at {url}")
+    threading.Thread(target=lambda: webbrowser.open(url), daemon=True).start()
+
+    try:
+        server.serve_forever()
+    except KeyboardInterrupt:
+        print("\nShutting down...")
+    finally:
+        server.server_close()
+
+
+if __name__ == "__main__":  # pragma: no cover - CLI execution
+    chosen_port = DEFAULT_PORT
+    if len(sys.argv) == 2:
+        try:
+            chosen_port = int(sys.argv[1])
+        except ValueError:
+            raise SystemExit("Port must be an integer")
+    main(port=chosen_port)
diff --git a/browser/static/index.html b/browser/static/index.html
new file mode 100644
index 0000000..3d0c178
--- /dev/null
+++ b/browser/static/index.html
@@ -0,0 +1,50 @@
+
+
+
+ + +Load outputs.json (or similar) to browse claims.
The browser will decode tokens with the Llama 3.1 tokenizer and highlight claim spans.
+Pick a JSON file to get started.