diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9331359
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,89 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+.Python
+*.egg-info/
+dist/
+build/
+*.whl
+
+# Virtual environment
+venv/
+.env/
+env/
+.venv/
+
+# IDE and editor files
+.vscode/
+.idea/
+*.sublime-project
+*.sublime-workspace
+
+# OS-specific files
+.DS_Store
+Thumbs.db
+
+# Hydra outputs
+outputs/
+multirun/
+experiments/finetune_vlm/logs/
+
+# Weights & Biases (W&B) logs
+wandb/
+wandb/*
+run-*.wandb
+wandb/run-*
+
+# Model checkpoints and large files
+experiments/finetune_vlm/qwen2_0_5b_lora/
+*.safetensors
+*.bin
+*.pt
+*.pth
+model.safetensors
+checkpoints/
+
+# Datasets
+data/
+*.zip
+*.tar.gz
+*.gz
+*.tgz
+
+# Temporary files
+*.tmp
+*.temp
+*.bak
+*.swp
+*~
+
+# Logs
+*.log
+logs/
+
+# Sensitive files
+*.env
+*.secret
+secrets.yaml
+config/secrets/
+
+# Jupyter notebook checkpoints
+.ipynb_checkpoints/
+
+# Coverage and testing
+.coverage
+coverage.xml
+*.cover
+*.pytest_cache/
+htmlcov/
+
+# Documentation build
+docs/_build/
+docs/build/
+
+# Miscellaneous
+*.cache
+.cache/
+*.prof
\ No newline at end of file
diff --git a/configs/config.yaml b/configs/config.yaml
index e69de29..afda3ec 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -0,0 +1,15 @@
+defaults:
+  - _self_
+  - training
+  - model: qwen2_0_5b
+  - dataset: coco_tiny
+
+wandb:
+  project: vlm_finetune_experiments
+  team: JAC-VISION
+
+experiment: null
+
+hydra:
+  job:
+    chdir: False
\ No newline at end of file
diff --git a/configs/dataset/coco_tiny.yaml b/configs/dataset/coco_tiny.yaml
new file mode 100644
index 0000000..05a6310
--- /dev/null
+++ b/configs/dataset/coco_tiny.yaml
@@ -0,0 +1,4 @@
+dataset:
+  name: coco_tiny
+  size: 100
+  type: image_text
\ No newline at end of file
diff --git a/configs/model/qwen2_0_5b.yaml b/configs/model/qwen2_0_5b.yaml
new file mode 100644
index 0000000..a9d7b24
--- /dev/null
+++ b/configs/model/qwen2_0_5b.yaml
@@ -0,0 +1,5 @@
+model:
+  name: qwen2_0_5b
+  pretrained_path: Qwen/Qwen2-0.5B
+  vision_model_path: openai/clip-vit-base-patch32
+  size: 0.5B
\ No newline at end of file
diff --git a/configs/sweep/vlm_sweep.yaml b/configs/sweep/vlm_sweep.yaml
new file mode 100644
index 0000000..4153cfa
--- /dev/null
+++ b/configs/sweep/vlm_sweep.yaml
@@ -0,0 +1,7 @@
+sweep:
+  lr:
+    - 1e-5
+    - 5e-5
+  batch_size:
+    - 1
+    - 2
\ No newline at end of file
diff --git a/configs/training.yaml b/configs/training.yaml
new file mode 100644
index 0000000..d35d2af
--- /dev/null
+++ b/configs/training.yaml
@@ -0,0 +1,4 @@
+training:
+  epochs: 3
+  lr: 1e-5
+  batch_size: 1
\ No newline at end of file
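Aside: the defaults lists above compose into a single config tree at runtime, with each group file mounted under its group key (so the model file's contents land at model.model.*). A minimal sketch for inspecting the composed tree outside of @hydra.main, using Hydra's compose API; the experiment override assumes the finetune_vlm registry entry added later in this patch:

    from hydra import initialize, compose
    from omegaconf import OmegaConf

    # Build the same config tree that main.py receives, then print it as YAML.
    with initialize(config_path="configs", version_base="1.1"):
        cfg = compose(config_name="config", overrides=["experiment=finetune_vlm"])
        print(OmegaConf.to_yaml(cfg))  # shows training.lr, model.model.pretrained_path, ...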
diff --git a/core/runner.py b/core/runner.py
new file mode 100644
index 0000000..1214259
--- /dev/null
+++ b/core/runner.py
@@ -0,0 +1,25 @@
+from omegaconf import DictConfig
+import yaml
+import importlib
+import os
+
+def load_registry(registry_path: str):
+    # Resolve the registry path relative to the repo root (one level above runner.py)
+    abs_path = os.path.join(os.path.dirname(__file__), '..', registry_path)
+    with open(abs_path, 'r') as f:
+        registry = yaml.safe_load(f)
+    return registry
+
+def run_experiment(cfg: DictConfig):
+    registry = load_registry('experiment_registry.yaml')
+    experiment_name = cfg.experiment
+
+    for exp in registry['experiments']:
+        if exp['name'] == experiment_name and exp['active']:
+            module_path = exp['module']
+            module_name, func_name = module_path.rsplit('.', 1)
+            module = importlib.import_module(module_name)
+            func = getattr(module, func_name)
+            return func(cfg)
+
+    raise ValueError(f"Experiment '{experiment_name}' not found or inactive in registry")
\ No newline at end of file
diff --git a/experiment_registry.yaml b/experiment_registry.yaml
index dc92040..f67b0b2 100644
--- a/experiment_registry.yaml
+++ b/experiment_registry.yaml
@@ -13,3 +13,8 @@ experiments:
     description: "Vary training image count to study performance scaling"
     module: data_impact
     active: true
+
+  - name: finetune_vlm
+    description: "Fine-tune Qwen2-0.5B with CLIP-ViT for VLM tasks using LoRA"
+    module: experiments.finetune_vlm.run
+    active: true
\ No newline at end of file
diff --git a/experiments/finetune_vlm/__init__.py b/experiments/finetune_vlm/__init__.py
new file mode 100644
index 0000000..df0149f
--- /dev/null
+++ b/experiments/finetune_vlm/__init__.py
@@ -0,0 +1,4 @@
+from .run import run
+
+# This file makes experiments.finetune_vlm a Python package.
+# Re-exporting run ensures getattr(experiments.finetune_vlm, "run") resolves in core.runner.
\ No newline at end of file
diff --git a/experiments/finetune_vlm/config.yaml b/experiments/finetune_vlm/config.yaml
new file mode 100644
index 0000000..95f9d42
--- /dev/null
+++ b/experiments/finetune_vlm/config.yaml
@@ -0,0 +1,12 @@
+defaults:
+  - _self_
+  - /model: qwen2_0_5b
+  - /dataset: coco_tiny
+  - /training
+
+wandb:
+  project: vlm_finetune_experiments
+  group: finetune_vlm
+  team: JAC-VISION
+
+output_dir: experiments/finetune_vlm/qwen2_0_5b_lora
\ No newline at end of file
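Aside: the registry entry's module field points at the submodule experiments.finetune_vlm.run, but core/runner.py splits on the last dot and resolves the final component as an attribute of the package. That is why the re-export in __init__.py above is load-bearing; a sketch of the lookup the runner performs:

    import importlib

    # What core/runner.py does for module: experiments.finetune_vlm.run
    module_name, func_name = "experiments.finetune_vlm.run".rsplit(".", 1)
    module = importlib.import_module(module_name)  # imports the experiments.finetune_vlm package
    func = getattr(module, func_name)  # succeeds only because __init__.py re-exports run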
diff --git a/experiments/finetune_vlm/logic.py b/experiments/finetune_vlm/logic.py
new file mode 100644
index 0000000..d23e14a
--- /dev/null
+++ b/experiments/finetune_vlm/logic.py
@@ -0,0 +1,135 @@
+import torch
+import torch.nn as nn
+from omegaconf import DictConfig
+import wandb
+from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPVisionModel, CLIPProcessor
+from peft import LoraConfig, get_peft_model
+from torch.utils.data import Dataset, DataLoader
+from PIL import Image
+import os
+
+class DummyCOCODataset(Dataset):
+    def __init__(self, size=100):
+        self.size = size
+        # Synthetic data: dummy images and captions
+        self.images = [Image.new('RGB', (64, 64), color='gray') for _ in range(size)]
+        self.captions = [f"Object {i}" for i in range(size)]
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, idx):
+        return {'image': self.images[idx], 'text': self.captions[idx]}
+
+def custom_collate_fn(batch):
+    # Collate images (PIL.Image.Image) and texts (strings) into lists
+    images = [item['image'] for item in batch]
+    texts = [item['text'] for item in batch]
+    return {'image': images, 'text': texts}
+
+class VLM(nn.Module):
+    def __init__(self, language_model, vision_model):
+        super().__init__()
+        self.language_model = language_model
+        self.vision_model = vision_model
+        self.projection = nn.Linear(768, language_model.config.hidden_size, dtype=torch.float16)  # CLIP ViT-B/32 hidden size (768) to Qwen, match float16
+
+    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
+        # Encode image
+        vision_outputs = self.vision_model(pixel_values=pixel_values)
+        image_hidden_states = vision_outputs.pooler_output
+        image_embeds = self.projection(image_hidden_states)
+
+        # Combine with text (simplified: image_embeds is computed but not yet fed to the language model)
+        inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
+        outputs = self.language_model(**inputs, labels=labels)
+        return outputs
+
+class VLMFinetuneExperiment:
+    def __init__(self, cfg: DictConfig):
+        self.cfg = cfg
+        self.device = torch.device("cpu")  # Force CPU for Intel Iris (no CUDA backend)
+
+        # Load models
+        self.language_model = AutoModelForCausalLM.from_pretrained(
+            cfg.model.model.pretrained_path,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.model.pretrained_path)
+        self.vision_model = CLIPVisionModel.from_pretrained(
+            cfg.model.model.vision_model_path,
+            torch_dtype=torch.float16
+        ).to(self.device)
+        self.processor = CLIPProcessor.from_pretrained(cfg.model.model.vision_model_path)
+
+        # Combine into VLM
+        self.model = VLM(self.language_model, self.vision_model)
+
+        # Apply LoRA to language model only
+        lora_config = LoraConfig(
+            r=8,
+            lora_alpha=16,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.1,
+            bias="none"
+        )
+        self.model.language_model = get_peft_model(self.model.language_model, lora_config)
+        self.model.to(self.device)
+
+        # Dataset and loader
+        self.dataset = DummyCOCODataset(size=cfg.dataset.dataset.size)
+        self.dataloader = DataLoader(
+            self.dataset,
+            batch_size=cfg.training.batch_size,
+            shuffle=True,
+            collate_fn=custom_collate_fn
+        )
+
+        # Optimizer
+        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=cfg.training.lr)
+
+    def train(self):
+        self.model.train()
+        for epoch in range(1, self.cfg.training.epochs + 1):
+            total_loss = 0
+            correct = 0
+            total = 0
+
+            for batch in self.dataloader:
+                images = batch['image']
+                texts = batch['text']
+
+                # Process inputs
+                text_inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(self.device)
+                image_inputs = self.processor(images=images, return_tensors="pt").to(self.device)
+
+                # Forward pass
+                outputs = self.model(
+                    input_ids=text_inputs.input_ids,
+                    attention_mask=text_inputs.attention_mask,
+                    pixel_values=image_inputs.pixel_values,
+                    labels=text_inputs.input_ids
+                )
+                loss = outputs.loss
+
+                # Backward pass
+                self.optimizer.zero_grad()
+                loss.backward()
+                self.optimizer.step()
+
+                total_loss += loss.item()
+
+                # Simplified token accuracy (ignores the causal shift and padding)
+                preds = torch.argmax(outputs.logits, dim=-1)
+                correct += (preds == text_inputs.input_ids).float().mean().item()
+                total += 1
+
+            avg_loss = total_loss / len(self.dataloader)
+            accuracy = correct / total if total > 0 else 0
+            wandb.log({"epoch": epoch, "loss": avg_loss, "accuracy": accuracy})
+            print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
+
+            # Save checkpoint
+            os.makedirs(self.cfg.output_dir, exist_ok=True)
+            self.model.language_model.save_pretrained(f"{self.cfg.output_dir}/epoch_{epoch}")
\ No newline at end of file
diff --git a/experiments/finetune_vlm/run.py b/experiments/finetune_vlm/run.py
new file mode 100644
index 0000000..480edce
--- /dev/null
+++ b/experiments/finetune_vlm/run.py
@@ -0,0 +1,9 @@
+from omegaconf import DictConfig, OmegaConf
+from .logic import VLMFinetuneExperiment
+import wandb
+
+def run(cfg: DictConfig):
+    wandb.init(project=cfg.wandb.project, config=OmegaConf.to_container(cfg, resolve=True), group=cfg.experiment, entity=cfg.wandb.team)
+    experiment = VLMFinetuneExperiment(cfg)
+    experiment.train()
+    wandb.finish()
\ No newline at end of file
diff --git a/main.py b/main.py
index bfb6b45..cbba80c 100644
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
 import hydra
 from omegaconf import DictConfig
-from runner import run_experiment
+from core.runner import run_experiment
 
-@hydra.main(config_path="configs", config_name="config", version_base="1.3")
+@hydra.main(config_path="configs", config_name="config", version_base="1.1")
 def main(cfg: DictConfig):
     run_experiment(cfg)
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
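Aside: as the comment in VLM.forward notes, the CLIP features are projected but never passed to the language model, so the loss above is a text-only objective. One common way to wire the modalities together is to prepend the projected image embedding as a single soft token; the sketch below is an assumed design, not part of this patch, and omits labels handling:

    import torch

    def forward_with_image(self, input_ids, attention_mask, pixel_values):
        # Project pooled CLIP features into the language model's embedding space.
        image_embeds = self.projection(self.vision_model(pixel_values=pixel_values).pooler_output)
        # Embed the text tokens, then prepend the image embedding as one extra position.
        text_embeds = self.language_model.get_input_embeddings()(input_ids)
        inputs_embeds = torch.cat([image_embeds.unsqueeze(1), text_embeds], dim=1)
        # Extend the attention mask to cover the prepended image position.
        ones = torch.ones(attention_mask.size(0), 1, dtype=attention_mask.dtype, device=attention_mask.device)
        return self.language_model(inputs_embeds=inputs_embeds, attention_mask=torch.cat([ones, attention_mask], dim=1))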
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b1b6a2c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+torch>=2.0.0
+hydra-core>=1.3.0
+omegaconf>=2.3.0
+wandb>=0.15.0
+numpy>=1.24.0
+transformers>=4.36.0
+peft>=0.10.0
+pillow>=9.0.0
\ No newline at end of file
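With the registry entry, configs, and dependencies above in place, the experiment should be launchable roughly as follows (hypothetical session; assumes wandb is logged in and the JAC-VISION entity exists):

    pip install -r requirements.txt
    python main.py experiment=finetune_vlm

    # Sweep the values mirrored in configs/sweep/vlm_sweep.yaml via Hydra multirun:
    python main.py -m experiment=finetune_vlm training.lr=1e-5,5e-5 training.batch_size=1,2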