# Repository Guidelines

This is a concise, coding‑agent–friendly guide to contributing to and extending the llm-jp-eval-mm evaluation framework.

## Project Structure

- `src/eval_mm/`: Core library
  - `tasks/`: Task loaders/adapters; register in `task_registry.py`
  - `metrics/`: Scorers and aggregation utilities; register in `scorer_registry.py`
  - `utils/`: Helpers (e.g., Azure/OpenAI client)
- `examples/`: Reference VLM wrappers and runnable samples
  - `vila/`: llm-jp VILA wrapper (submodule)
  - `llava/`: official LLaVA (optional submodule)
- `scripts/`: Leaderboard, Streamlit browser, dataset prep
- `assets/`, `data/`, `dataset/`: Static assets and datasets (not committed)
- `result/`, `outputs/`: Evaluation artifacts written by runs

## Key Commands

- Setup: `uv sync` (model deps via groups, e.g., `uv sync --group normal`)
- Run sample eval: `uv run --group normal python examples/sample.py ...`
- Tests: `bash test.sh` (tasks/metrics), `bash test_model.sh` (model smoke)
- Lint/format: `uv run ruff format src && uv run ruff check --fix src`
- Type check: `uv run mypy src`
- Browse predictions: `uv run streamlit run scripts/browse_prediction.py -- --task_id <id> --result_dir result --model_list <model>`
- Leaderboard: `python scripts/make_leaderboard.py --result_dir result`

## Development Playbook (for Agents)

- Add a task: implement `Task` in `src/eval_mm/tasks/<name>.py`; import it in `src/eval_mm/tasks/__init__.py`; register with `@register_task` in `task_registry.py` (see the first sketch after this list).
- Add a scorer: implement it in `src/eval_mm/metrics/<name>_scorer.py`; import it in `metrics/__init__.py`; register it in `scorer_registry.py` (see the second sketch after this list).
- Add a model: wrap it in `examples/` (see the existing VLM wrappers) and map it via `examples/model_table.py`.
- Import pattern: `from eval_mm import TaskRegistry, ScorerRegistry` (avoid `src.` prefixes).
- Tests: include `def test_*` functions near tasks/metrics; prefer `bash test.sh` (tasks/metrics) and `bash test_model.sh` (model smoke). For a single file you may run `uv run --group dev pytest <path> -v`, but CI expects the scripts.

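A minimal sketch of the task-registration flow above. The `@register_task` decorator, the `Task` base class, the registry module, and the import pattern come from this guide; the exact module paths inside `eval_mm.tasks`, the task ID, and the method names are illustrative assumptions, so mirror an existing file under `src/eval_mm/tasks/` for the real interface.

```python
# src/eval_mm/tasks/my_new_task.py -- hypothetical file; names below are assumptions
from eval_mm.tasks.task import Task                     # base-class location is a guess
from eval_mm.tasks.task_registry import register_task   # decorator import path is a guess


@register_task("my-new-task")  # illustrative task ID
class MyNewTask(Task):
    def load_dataset(self):
        # Load or prepare the benchmark split here (method name is an assumption).
        ...

    def doc_to_text(self, doc: dict) -> str:
        # Build the prompt shown to the model for one example.
        return doc["question"]

    def doc_to_answer(self, doc: dict) -> str:
        # Return the gold answer consumed by the scorer.
        return doc["answer"]
```

After defining the class, import it in `src/eval_mm/tasks/__init__.py` so registration runs, and access it through the preferred import pattern (`from eval_mm import TaskRegistry`), not through `src.`-prefixed paths.
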
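A corresponding sketch for a new scorer, including a co-located `def test_*` function as the Tests bullet above recommends. The `Scorer` base class, the registration decorator, and the `score` signature are assumptions; registration may instead be an explicit entry in `scorer_registry.py`, so follow whichever pattern the existing `*_scorer.py` files use.

```python
# src/eval_mm/metrics/my_metric_scorer.py -- hypothetical file; names below are assumptions
from eval_mm.metrics.scorer import Scorer                    # base-class location is a guess
from eval_mm.metrics.scorer_registry import register_scorer  # registration hook is a guess


@register_scorer("my-metric")  # illustrative metric ID
class MyMetricScorer(Scorer):
    def score(self, refs: list[str], preds: list[str]) -> list[float]:
        # Per-example scores; aggregation helpers live alongside the scorers.
        return [float(r.strip() == p.strip()) for r, p in zip(refs, preds)]


def test_my_metric_scorer():
    # Co-located test picked up by `bash test.sh` / pytest.
    assert MyMetricScorer().score(["a"], ["a"]) == [1.0]
```
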
## Plan-First Workflow

- Before any change, prepare a short checklist: objective, source of truth, inventory, diff policy, implementation steps, and acceptance criteria.
- After alignment, implement the minimum needed for the agreed scope.
- Example (naming unification):
  - Source of truth: treat `scripts/nvlink/config.sh` entries (e.g., task IDs and metric map) as canonical.
  - Inventory: compare identifiers used across code and configuration, and list discrepancies.
  - Implementation: adopt the canonical identifiers in public-facing interfaces; keep backward-compatible aliases only if necessary.
  - Validation: run `uv run python scripts/validate_config_consistency.py` and `bash test.sh`.

## Coding Style & Conventions

- Python ≥ 3.12, 4‑space indentation, type hints required
- Names: packages/modules `lower_snake_case`; classes `CamelCase`; functions/vars `lower_snake_case`
- Keep functions focused; prefer dataclasses/typed types for structured data (see the example below)
- Use Ruff + pre-commit; follow the existing import order and ignore rules

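A small illustration of the dataclass preference; the field names are made up for the example and do not correspond to a real library type.

```python
from dataclasses import dataclass


@dataclass
class ExampleResult:
    """One evaluated example (illustrative fields only)."""

    question_id: str
    prediction: str
    score: float
```
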
## Commit & PR Guidelines

- Prefix commits with `feat:`, `fix:`, `chore:`, `docs:` (see `git log`)
- PRs include: clear description, linked issues, repro commands, sample outputs (e.g., `result/<task>/<model>/evaluation.jsonl`); CI must pass

## Security & Config

- LLM‑as‑a‑Judge: set `.env` with `AZURE_OPENAI_ENDPOINT`/`AZURE_OPENAI_KEY` or `OPENAI_API_KEY`
- Do not commit secrets or large datasets; use `.env.sample`
- Add model deps via `uv` groups and update conflicts in `pyproject.toml`

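A minimal sketch of checking those judge credentials before an LLM-as-a-Judge run. The selection logic here is illustrative only; the actual client lives in `src/eval_mm/utils/`.

```python
import os

# Either the Azure pair or a plain OpenAI key must be present in the environment
# (loaded from .env); never hard-code or commit these values.
if os.getenv("AZURE_OPENAI_ENDPOINT") and os.getenv("AZURE_OPENAI_KEY"):
    judge_backend = "azure"
elif os.getenv("OPENAI_API_KEY"):
    judge_backend = "openai"
else:
    raise RuntimeError("Set judge credentials in .env (see .env.sample).")
```
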
## Temporary Validation (_tmp_ Policy)

- Name temporary files/dirs with `_tmp_` (e.g., `result/<task>/<model>_tmp_/<run>`).
- Keep them under `result/`, `outputs/`, or `artifact/` and remove them after validation.
- Avoid committing `_tmp_` artifacts; they are ignored by `.gitignore`.

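A quick sketch of building a compliant temporary output path; the task and model names are placeholders.

```python
from pathlib import Path

task_id = "example-task"    # placeholder
model_id = "example-model"  # placeholder

# `_tmp_` in the directory name marks this run as disposable validation output.
tmp_dir = Path("result") / task_id / f"{model_id}_tmp_" / "run-001"
tmp_dir.mkdir(parents=True, exist_ok=True)
```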