89 changes: 89 additions & 0 deletions .gitignore
@@ -0,0 +1,89 @@
# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
*.whl

# Virtual environment
venv/
.env/
env/
.venv/

# IDE and editor files
.vscode/
.idea/
*.sublime-project
*.sublime-workspace

# OS-specific files
.DS_Store
Thumbs.db

# Hydra outputs
outputs/
multirun/
experiments/finetune_vlm/logs/

# Weights & Biases (W&B) logs
wandb/
wandb/*
run-*.wandb
wandb/run-*

# Model checkpoints and large files
experiments/finetune_vlm/qwen2_0_5b_lora/
*.safetensors
*.bin
*.pt
*.pth
model.safetensors
checkpoints/

# Datasets
data/
*.zip
*.tar.gz
*.gz
*.tgz

# Temporary files
*.tmp
*.temp
*.bak
*.swp
*~

# Logs
*.log
logs/

# Sensitive files
*.env
*.secret
secrets.yaml
config/secrets/

# Jupyter notebooks checkpoints
.ipynb_checkpoints/

# Coverage and testing
.coverage
coverage.xml
*.cover
*.pytest_cache/
htmlcov/

# Documentation build
docs/_build/
docs/build/

# Miscellaneous
*.cache
.cache/
*.prof
16 changes: 16 additions & 0 deletions configs/config.yaml
@@ -0,0 +1,16 @@
defaults:
  - _self_
  - training
  - model: qwen2_0_5b
  - dataset: coco_tiny

wandb:
  project: vlm_finetune_experiments
  team: JAC-VISION

experiment: null

hydra:
  job:
    chdir: False
4 changes: 4 additions & 0 deletions configs/dataset/coco_tiny.yaml
@@ -0,0 +1,4 @@
dataset:
  name: coco_tiny
  size: 100
  type: image_text
5 changes: 5 additions & 0 deletions configs/model/qwen2_0_5b.yaml
@@ -0,0 +1,5 @@
model:
  name: qwen2_0_5b
  pretrained_path: Qwen/Qwen2-0.5B
  vision_model_path: openai/clip-vit-base-patch32
  size: 0.5B
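
Because this group file nests its values under a top-level model: key, Hydra composition yields a doubly nested structure, which is why logic.py reads cfg.model.model.pretrained_path. A minimal sketch of the composed shape (values copied from this diff; the OmegaConf construction is purely illustrative):

from omegaconf import OmegaConf

# Illustration only: approximate shape of the composed config for the model group.
cfg = OmegaConf.create({
    "model": {            # package assigned by the defaults entry "model: qwen2_0_5b"
        "model": {        # top-level key inside qwen2_0_5b.yaml
            "name": "qwen2_0_5b",
            "pretrained_path": "Qwen/Qwen2-0.5B",
            "vision_model_path": "openai/clip-vit-base-patch32",
            "size": "0.5B",
        }
    }
})
print(cfg.model.model.pretrained_path)  # Qwen/Qwen2-0.5B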
7 changes: 7 additions & 0 deletions configs/sweep/vlm_sweep.yaml
@@ -0,0 +1,7 @@
sweep:
  lr:
    - 1e-5
    - 5e-5
  batch_size:
    - 1
    - 2
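
Nothing else in this diff reads configs/sweep/vlm_sweep.yaml yet. A hypothetical sketch of how the grid it defines could be expanded into individual runs (the file path and keys come from this diff; the expansion loop itself is an assumption):

import itertools
import yaml

# Hypothetical illustration: enumerate the 2 x 2 grid defined in the sweep file.
with open("configs/sweep/vlm_sweep.yaml") as f:
    sweep = yaml.safe_load(f)["sweep"]

for lr, batch_size in itertools.product(sweep["lr"], sweep["batch_size"]):
    # Each combination could be passed along as Hydra overrides for training.lr / training.batch_size.
    print(f"training.lr={lr} training.batch_size={batch_size}")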
4 changes: 4 additions & 0 deletions configs/training.yaml
@@ -0,0 +1,4 @@
training:
  epochs: 3
  lr: 1e-5
  batch_size: 1
26 changes: 26 additions & 0 deletions core/runner.py
@@ -0,0 +1,26 @@
import hydra
from omegaconf import DictConfig
import yaml
import importlib
import os

def load_registry(registry_path: str):
    # Resolve the registry path relative to the repository root (one level above core/)
    abs_path = os.path.join(os.path.dirname(__file__), '..', registry_path)
    with open(abs_path, 'r') as f:
        registry = yaml.safe_load(f)
    return registry

def run_experiment(cfg: DictConfig):
    registry = load_registry('experiment_registry.yaml')
    experiment_name = cfg.experiment

    for exp in registry['experiments']:
        if exp['name'] == experiment_name and exp['active']:
            module_path = exp['module']
            module_name, func_name = module_path.rsplit('.', 1)
            module = importlib.import_module(module_name)
            func = getattr(module, func_name)
            return func(cfg)

    raise ValueError(f"Experiment '{experiment_name}' not found or inactive in registry")
5 changes: 5 additions & 0 deletions experiment_registry.yaml
@@ -13,3 +13,8 @@ experiments:
    description: "Vary training image count to study performance scaling"
    module: data_impact
    active: true

  - name: finetune_vlm
    description: "Fine-tune Qwen2-0.5B with CLIP-ViT for VLM tasks using LoRA"
    module: experiments.finetune_vlm.run
    active: true
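
For clarity, a minimal sketch of how the lookup in core/runner.py above resolves this new entry; it only restates logic already present in the diff:

import importlib

# The dotted module path from the registry entry is split into a module and an attribute.
module_path = "experiments.finetune_vlm.run"
module_name, func_name = module_path.rsplit(".", 1)  # "experiments.finetune_vlm", "run"

module = importlib.import_module(module_name)  # imports experiments/finetune_vlm/__init__.py
func = getattr(module, func_name)              # the run() re-exported by that __init__.py
# run_experiment() then calls func(cfg) with the composed Hydra config.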
4 changes: 4 additions & 0 deletions experiments/finetune_vlm/__init__.py
@@ -0,0 +1,4 @@
from .run import run

# Makes experiments.finetune_vlm a Python package and re-exports run so that
# getattr(experiments.finetune_vlm, "run") in core/runner.py resolves to the entry point.
12 changes: 12 additions & 0 deletions experiments/finetune_vlm/config.yaml
@@ -0,0 +1,12 @@
defaults:
  - _self_
  - /model: qwen2_0_5b
  - /dataset: coco_tiny
  - /training

wandb:
  project: vlm_finetune_experiments
  group: finetune_vlm
  team: JAC-VISION

output_dir: experiments/finetune_vlm/qwen2_0_5b_lora
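
Note that output_dir is defined only in this experiment-local file, while main.py composes configs/config.yaml. A hypothetical sketch of how the two could be overlaid before building the experiment (this helper is an assumption, not something the PR adds):

from omegaconf import DictConfig, OmegaConf

def with_local_config(cfg: DictConfig, path: str = "experiments/finetune_vlm/config.yaml") -> DictConfig:
    # Hypothetical helper: overlay the experiment-local file (output_dir, wandb.group)
    # onto the config composed by Hydra, letting the composed values win on conflicts.
    local_cfg = OmegaConf.load(path)
    return OmegaConf.merge(local_cfg, cfg)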
137 changes: 137 additions & 0 deletions experiments/finetune_vlm/logic.py
@@ -0,0 +1,137 @@
import torch
import torch.nn as nn
from omegaconf import DictConfig
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPVisionModel, CLIPProcessor
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
import numpy as np
from PIL import Image
import io
import os

class DummyCOCODataset(Dataset):
    def __init__(self, size=100):
        self.size = size
        # Synthetic data: dummy images and captions
        self.images = [Image.new('RGB', (64, 64), color='gray') for _ in range(size)]
        self.captions = [f"Object {i}" for i in range(size)]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        return {'image': self.images[idx], 'text': self.captions[idx]}

def custom_collate_fn(batch):
    # Collate images (PIL.Image.Image) and texts (strings) into lists
    images = [item['image'] for item in batch]
    texts = [item['text'] for item in batch]
    return {'image': images, 'text': texts}

class VLM(nn.Module):
    def __init__(self, language_model, vision_model):
        super().__init__()
        self.language_model = language_model
        self.vision_model = vision_model
        self.projection = nn.Linear(768, language_model.config.hidden_size, dtype=torch.float16)  # CLIP to Qwen, match float16

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        # Encode image
        vision_outputs = self.vision_model(pixel_values=pixel_values)
        image_hidden_states = vision_outputs.pooler_output
        image_embeds = self.projection(image_hidden_states)

        # Combine with text (simplified: pass text only to language model)
        inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        outputs = self.language_model(**inputs, labels=labels)
        return outputs

class VLMFinetuneExperiment:
    def __init__(self, cfg: DictConfig):
        self.cfg = cfg
        self.device = torch.device("cpu")  # Force CPU for Intel Iris

        # Load models
        self.language_model = AutoModelForCausalLM.from_pretrained(
            cfg.model.model.pretrained_path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.model.pretrained_path)
        self.vision_model = CLIPVisionModel.from_pretrained(
            cfg.model.model.vision_model_path,
            torch_dtype=torch.float16
        ).to(self.device)
        self.processor = CLIPProcessor.from_pretrained(cfg.model.model.vision_model_path)

        # Combine into VLM
        self.model = VLM(self.language_model, self.vision_model)

        # Apply LoRA to language model only
        lora_config = LoraConfig(
            r=8,
            lora_alpha=16,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.1,
            bias="none"
        )
        self.model.language_model = get_peft_model(self.model.language_model, lora_config)
        self.model.to(self.device)

        # Dataset and loader
        self.dataset = DummyCOCODataset(size=cfg.dataset.dataset.size)
        self.dataloader = DataLoader(
            self.dataset,
            batch_size=cfg.training.batch_size,
            shuffle=True,
            collate_fn=custom_collate_fn
        )

        # Optimizer
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=cfg.training.lr)

    def train(self):
        self.model.train()
        for epoch in range(1, self.cfg.training.epochs + 1):
            total_loss = 0
            correct = 0
            total = 0

            for batch in self.dataloader:
                images = batch['image']
                texts = batch['text']

                # Process inputs
                text_inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(self.device)
                image_inputs = self.processor(images=images, return_tensors="pt").to(self.device)

                # Forward pass
                outputs = self.model(
                    input_ids=text_inputs.input_ids,
                    attention_mask=text_inputs.attention_mask,
                    pixel_values=image_inputs.pixel_values,
                    labels=text_inputs.input_ids
                )
                loss = outputs.loss

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

                # Simplified accuracy
                preds = torch.argmax(outputs.logits, dim=-1)
                correct += (preds == text_inputs.input_ids).float().mean().item()
                total += 1

            avg_loss = total_loss / len(self.dataloader)
            accuracy = correct / total if total > 0 else 0
            wandb.log({"epoch": epoch, "loss": avg_loss, "accuracy": accuracy})
            print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

            # Save checkpoint
            os.makedirs(self.cfg.output_dir, exist_ok=True)
            self.model.language_model.save_pretrained(f"{self.cfg.output_dir}/epoch_{epoch}")
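
Because only the LoRA adapter is written out by save_pretrained, a later reload wraps the base model again. A minimal sketch, assuming the default three epochs and the output_dir from this experiment's config:

from transformers import AutoModelForCausalLM
from peft import PeftModel

# Sketch: reload the adapter saved after the final epoch (path assumes training.epochs=3).
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
model = PeftModel.from_pretrained(base, "experiments/finetune_vlm/qwen2_0_5b_lora/epoch_3")
model.eval()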
9 changes: 9 additions & 0 deletions experiments/finetune_vlm/run.py
@@ -0,0 +1,9 @@
from omegaconf import DictConfig
from .logic import VLMFinetuneExperiment
import wandb

def run(cfg: DictConfig):
    wandb.init(project=cfg.wandb.project, config=dict(cfg), group=cfg.experiment, entity=cfg.wandb.team)
    experiment = VLMFinetuneExperiment(cfg)
    experiment.train()
    wandb.finish()
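
If the nested Hydra config should show up fully expanded in the W&B UI, a common alternative to dict(cfg) is to resolve it first. A sketch, not part of this PR:

from omegaconf import DictConfig, OmegaConf
import wandb

def init_wandb(cfg: DictConfig):
    # Sketch: convert the composed DictConfig into a plain nested dict before logging it.
    wandb.init(
        project=cfg.wandb.project,
        config=OmegaConf.to_container(cfg, resolve=True),
        group=cfg.experiment,
        entity=cfg.wandb.team,
    )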
6 changes: 3 additions & 3 deletions main.py
@@ -1,10 +1,10 @@
import hydra
from omegaconf import DictConfig
-from runner import run_experiment
+from core.runner import run_experiment

-@hydra.main(config_path="configs", config_name="config", version_base="1.3")
+@hydra.main(config_path="configs", config_name="config", version_base="1.1")
def main(cfg: DictConfig):
    run_experiment(cfg)

if __name__ == "__main__":
-    main()
+    main()
8 changes: 8 additions & 0 deletions requirements.txt
@@ -0,0 +1,8 @@
torch>=2.0.0
hydra-core>=1.3.0
omegaconf>=2.3.0
wandb>=0.15.0
numpy>=1.24.0
transformers>=4.36.0
peft>=0.10.0
pillow>=9.0.0