41 changes: 26 additions & 15 deletions experiments/prompt-tuning/hc+ft/cardd/Inference.py
@@ -33,8 +33,9 @@ def log_metrics_to_excel(
inference_times: List[float],
vram_usage: List[float],
cosine_scores: List[float],
cider_scores: List[float],
spice_scores: List[float],
flickr_subset,
test_subset,
output_excel_path: str = "Flickr_pixtral.xlsx",
prompts: str = None,
wandb_project: str = "flickr-eval"
@@ -43,14 +44,16 @@ def log_metrics_to_excel(
pil_images = [] # keep images for wandb and excel insertion
n = len(samples)
for i, s in enumerate(samples):
pred = results[i] if i < len(results) else None
time_taken = inference_times[i] if i < len(inference_times) else None
vram = vram_usage[i] if i < len(vram_usage) else None
cos = cosine_scores[i] if i < len(cosine_scores) else None
spice = spice_scores[i] if i < len(spice_scores) else None
#pred = results[i] if i < len(results) else None
pred = results.get(s, None)
time_taken = inference_times.get(s, None)
vram = vram_usage.get(s, None)
cos = cosine_scores.get(s, None)
spice = spice_scores.get(s, None)
        cider = cider_scores[i] if i < len(cider_scores) else None  # CIDEr scores are positional: one per evaluated sample, in order

# sample lookup
sample_item = flickr_subset[s]
sample_item = test_subset[s]
pil_img = sample_item['image']
if not isinstance(pil_img, PILImage.Image):
pil_img = PILImage.fromarray(pil_img)
@@ -66,6 +69,7 @@ def log_metrics_to_excel(
"prediction": pred,
"cosine_score": cos,
"spice_score": spice,
"cider_score": cider,
"vram_usage": vram,
"inference_time_s": time_taken,
}
@@ -97,7 +101,7 @@ def log_metrics_to_excel(
except Exception as e:
print(f"[warning] wandb.init failed: {e}. Skipping wandb logging.")
return df, None
table_cols = ["sample_index", "image", "prediction", "cosine_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
table_cols = ["sample_index", "image", "prediction", "cosine_score", "cider_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
wandb_table = wandb.Table(columns=table_cols)
for i, row in enumerate(rows):
pil_img = pil_images[i]
@@ -112,6 +116,7 @@ def log_metrics_to_excel(
wb_image,
row["prediction"],
row["cosine_score"],
row["cider_score"],
row["spice_score"],
row["vram_usage"],
row["inference_time_s"],
@@ -134,6 +139,8 @@ def log_metrics_to_excel(
parser = argparse.ArgumentParser(description="Run inference on a vision-language model")
parser.add_argument("--prompt", type=str, default="Explain the image content step by step.", help="Prompt for the model")
parser.add_argument("--model-name", type=str, default="Pixtral-12B", help="Model name to evaluate (e.g. Pixtral-12B)")
parser.add_argument("--base-model", type=str, default="unsloth/Qwen2-VL-7B-Instruct", help="Base model name (Hugging Face repo)")
parser.add_argument("--pickle-path", type=str, default="/workspace/cardd-df.p", help="Path to the pickle file for CIDEr evaluation")
parser.add_argument("--dataset-folder", type=str, default="/workspace/filtered_dataset", help="Fallback image folder (if dataset items are paths)")
parser.add_argument("--wandb-project", type=str, default="flickr-eval", help="WandB project name")
parser.add_argument("--output-excel", type=str, default="Flickr_pixtral.xlsx", help="Output Excel file path")
@@ -165,28 +172,31 @@ def log_metrics_to_excel(

print("🔄 Loading Flickr subset dataset...")
try:
flickr_subset = load_from_disk(dataset_folder)
print("✅ Dataset loaded. Number of samples:", len(flickr_subset))
test_subset = load_from_disk(dataset_folder)
print("✅ Dataset loaded. Number of samples:", len(test_subset))
except Exception as e:
print(f"[warning] Could not load dataset via load_from_disk({dataset_folder}): {e}")
# fallback: try to treat dataset_folder as a directory of images
flickr_subset = []
print("⚠️ flickr_subset is empty; images will be looked up from --img-folder by index when possible")
test_subset = []
print("⚠️ test_subset is empty; images will be looked up from --img-folder by index when possible")

print(f"🚀 Running evaluation batch with model {model_name}...")
results, cosine_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
results, cosine_scores, cider_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
prompt,
flickr_subset,
test_subset,
samples,
multiple_refs,
MODEL_DIR=model_dir,
        BASE_MODEL=args.base_model,
        PICKLE_PATH=args.pickle_path,
LOAD_FROM_HF=args.load_from_hf
)


print("✅ Evaluation complete!")
print("📊 Results summary:")
print("Cosine scores:", cosine_scores)
print("CIDEr scores:", cider_scores)
print("SPICE scores:", spice_scores)
print("Inference times:", inference_times)
print("VRAM usage:", vram_usage)
@@ -200,8 +210,9 @@ def log_metrics_to_excel(
inference_times,
vram_usage,
cosine_scores,
cider_scores,
spice_scores,
flickr_subset,
test_subset,
output_excel_path=excel_path,
prompts=prompt,
wandb_project=wandb_project
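Note on the containers reaching `log_metrics_to_excel` after this change (a sketch of assumed shapes, inferred from `evaluate_batch` in `utils.py`; the indexes and values below are invented): predictions, cosine/SPICE scores, timings, and VRAM figures arrive as dicts keyed by dataset index, while the CIDEr scores arrive as a positional array aligned with `samples`.

```python
# Illustrative shapes only -- sample indexes and scores are made up.
samples = [12, 47, 93]                           # dataset indexes that were evaluated
results = {12: "a dented front door", 47: "a cracked headlight", 93: "a scratched bumper"}
cosine_scores = {12: 0.81, 47: 0.77, 93: 0.69}   # keyed by dataset index
cider_scores = [1.42, 0.95, 1.10]                # positional: cider_scores[i] pairs with samples[i]

for i, s in enumerate(samples):
    pred = results.get(s)
    cider = cider_scores[i] if i < len(cider_scores) else None
```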
Binary file added experiments/prompt-tuning/hc+ft/cardd/cardd-df.p
Binary file not shown.
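The committed `cardd-df.p` binary is the pickled document-frequency table that `cider.py` unpickles in `CiderScorer.compute_score`. The PR does not show how it was generated; a plausible sketch (an assumption — `build_df_pickle` and the example captions are invented for illustration) would accumulate the reference captions with `CiderScorer` and pickle its `document_frequency` mapping:

```python
# Hypothetical sketch: building a document-frequency pickle such as cardd-df.p.
import pickle
from cider import CiderScorer

def build_df_pickle(reference_lists, out_path="cardd-df.p", n=4):
    """reference_lists: one list of reference caption strings per image."""
    scorer = CiderScorer(n=n)
    for refs in reference_lists:
        scorer.cook_append(None, refs)             # only references are needed for document frequencies
    scorer.compute_doc_freq()
    with open(out_path, "wb") as f:
        pickle.dump(scorer.document_frequency, f)  # compute_score() loads exactly this mapping

# build_df_pickle([["a dent on the front door", "the driver door is dented"], ...])
```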
4 changes: 2 additions & 2 deletions experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh
@@ -14,7 +14,7 @@ RUN_SCRIPT="Inference.py"
WANDB_PROJECT="cardd-eval"
MODEL_NAME="unsloth/Qwen2-VL-7B-Instruct"
SAMPLE_FOLDER="kaggle/working/cardd_sample_hf/train"
USE_HF_DOWNLOAD=false
USE_HF_DOWNLOAD=true

HF_TOKEN="" # add your huggingface token here
REPO_ID="" # add your huggingface repo id here
@@ -63,7 +63,7 @@ for i in "${!PROMPTS[@]}"; do
--wandb-project "$WANDB_PROJECT" \
--output-excel "$OUTPUT_XLS" \
--model-dir "$MODEL_DIR" \
#--load-from-hf #remove this flag if not loading from HF
--load-from-hf #remove this flag if not loading from HF
echo "✅ Done. Excel saved at: $OUTPUT_XLS"
else
echo "❗ $RUN_SCRIPT not found in cwd. If you don't have it, run your own eval script and pass --model-name or --model-path as $MODEL_ROOT"
158 changes: 158 additions & 0 deletions experiments/prompt-tuning/hc+ft/cardd/cider.py
@@ -0,0 +1,158 @@
# Tsung-Yi Lin <[email protected]>
# Ramakrishna Vedantam <[email protected]>

import copy
import pickle
from collections import defaultdict
import numpy as np
import math
import os

def precook(s, n=4, out=False):
    """Count all n-grams of length 1..n in a whitespace-tokenized sentence."""
words = s.split()
counts = defaultdict(int)
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i:i + k])
counts[ngram] += 1
return counts

def cook_refs(refs, n=4):
return [precook(ref, n) for ref in refs]

def cook_test(test, n=4):
return precook(test, n, True)

class CiderScorer(object):
def copy(self):
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new

def __init__(self, test=None, refs=None, n=4, sigma=6.0):
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.cook_append(test, refs)
self.ref_len = None

def cook_append(self, test, refs):
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test))
else:
self.ctest.append(None)

def size(self):
assert len(self.crefs) == len(self.ctest), f"refs/test mismatch! {len(self.crefs)}<>{len(self.ctest)}"
return len(self.crefs)

def __iadd__(self, other):
if isinstance(other, tuple):
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self

    def compute_doc_freq(self):
        # Document frequency: for each n-gram, count how many images' reference sets contain it.
for refs in self.crefs:
for ngram in set([ngram for ref in refs for (ngram, _) in ref.items()]):
self.document_frequency[ngram] += 1

def compute_cider(self):
def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram, term_freq) in cnts.items():
df = np.log(max(1.0, self.document_frequency.get(ngram, 0.0)))
n = len(ngram) - 1
if n >= self.n:
continue
vec[n][ngram] = float(term_freq) * (self.ref_len - df)
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length

def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
for (ngram, _) in vec_hyp[n].items():
val[n] += vec_hyp[n][ngram] * vec_ref[n].get(ngram, 0.0)
if norm_hyp[n] != 0 and norm_ref[n] != 0:
val[n] /= (norm_hyp[n] * norm_ref[n])
assert not math.isnan(val[n])
return val

        self.ref_len = np.log(float(40504))  # log of the COCO val2014 corpus size; assumes the DF pickle was built over a comparable corpus

scores = []
for test, refs in zip(self.ctest, self.crefs):
vec, norm, length = counts2vec(test)
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
score_avg = np.mean(score)
score_avg /= len(refs)
score_avg *= 10.0
scores.append(score_avg)
return scores

def compute_score(self, df_mode, pfile_path, option=None, verbose=0):
with open(pfile_path, 'rb') as f:
self.document_frequency = pickle.load(f)
score = self.compute_cider()
return np.mean(np.array(score)), np.array(score)
class Cider:
"""
Main Class to compute the CIDEr metric

"""
def __init__(self, n=4, df="coco-val-df"):
"""
Initialize the CIDEr scoring function
        :param n (int): n-gram size
        :param df (string): label for the IDF source (kept for compatibility; the
            actual document frequencies are loaded from the pickle passed to compute_score)
        :return: None
"""
# set cider to sum over 1 to 4-grams
self._n = n
self._df = df

def compute_score(self, gts, res, pfile_path):
"""
Main function to compute CIDEr score
        :param gts (dict): {image_id: list of tokenized reference sentences}
        :param res (list): [{"image_id": ..., "caption": [tokenized candidate sentence]}, ...]
        :param pfile_path (str): path to the pickled document-frequency table
        :return: (corpus-level CIDEr score (float), per-image scores (np.ndarray))
"""

cider_scorer = CiderScorer(n=self._n)

for res_id in res:

hypo = res_id['caption']
ref = gts[res_id['image_id']]

# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
cider_scorer += (hypo[0], ref)

        (score, scores) = cider_scorer.compute_score(self._df, pfile_path=pfile_path)

return score, scores

def method(self):
return "CIDEr"
48 changes: 37 additions & 11 deletions experiments/prompt-tuning/hc+ft/cardd/utils.py
@@ -6,11 +6,13 @@
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import pandas as pd
from cider import Cider
import time
import torch
import os

def get_similarity_score(reference_captions, generated_caption):

def get_similarity_score(reference_captions, generated_caption, scorer):
try:
total_score = 0.0
for caption in reference_captions:
@@ -25,6 +27,19 @@ def get_similarity_score(reference_captions, generated_caption):
except Exception as e:
return 0.0

def evaluate_cider(hypos, refs, PICKLE_PATH):
gts = {str(i): refs[i] for i in refs}

res = [{"image_id": str(i), "caption": hypos[i]} for i in hypos]

# Evaluate
cider = Cider()
score, individual_scores = cider.compute_score(gts, res, PICKLE_PATH)
print(f"🎯 CIDEr score: {score:.4f}")

return score, individual_scores


def calculate_spice(gts, res, stanford_corenlp_home=None):
"""
Calculates SPICE score.
@@ -121,14 +136,14 @@ def run_inference(image, model, tokenizer, instruction):
# On error, return empty caption and zeros
return "", 0.0, 0.0

def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", LOAD_FROM_HF=False):
def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", BASE_MODEL="unsloth/Qwen2-VL-7B-Instruct", PICKLE_PATH="/workspace/cardd-df.p", LOAD_FROM_HF=False):
"""
prompts_list: list of instructions to evaluate
val_data: DataFrame with ['image', 'caption'] columns,
indexes: list of indexes to sample from val_data
"""
print(f"🔄 Loading vision-language model from {MODEL_DIR}...")
BASE_MODEL = "unsloth/Qwen2-VL-7B-Instruct"
print(f"🔄 Loading vision-language model from {MODEL_DIR}...")

# --- Load model ---
if LOAD_FROM_HF:
print(f"🔄 Loading base model '{BASE_MODEL}'...")
@@ -168,27 +183,38 @@ def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/wo
if multiple_refs:
reference_list = sample['caption']
pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
cos_score = get_similarity_score(reference_list, pred)
            cos_score = get_similarity_score(reference_list, pred, scorer)

else:
reference_list = [sample['caption']]
pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
cos_score = get_similarity_score(reference_list, pred)
            cos_score = get_similarity_score(reference_list, pred, scorer)

all_results[index] = pred
cosine_scores[index] = cos_score
Inference_time[index] = inference_time
Vram_usages[index] = peak_vram
gts = {}
res = {}
for i in range(len(indexes)):
gts[str(i)] = [{"caption": ref} for ref in reference_list]
res[str(i)] = [{"caption": all_results[indexes[i]]}]
for j, idx in enumerate(indexes):
sample = val_data[idx]
refs = sample['caption'] if multiple_refs else [sample['caption']]
gts[str(j)] = [{"caption": ref} for ref in refs]
res[str(j)] = [{"caption": all_results[idx]}]
spice_score, spice_scores_per_instance = calculate_spice(gts, res)
for i, idx in enumerate(indexes):
Spice_scores[idx] = spice_scores_per_instance[i] if spice_scores_per_instance else 0.0


    # Build dicts for CIDEr, keyed by position j (same order as `indexes`)
    hypos = {j: [all_results[idx]] for j, idx in enumerate(indexes)}  # position -> [prediction string]
    refs_dict = {
        j: (val_data[idx]['caption'] if multiple_refs else [val_data[idx]['caption']])
        for j, idx in enumerate(indexes)
    }  # position -> list of reference strings

# Call CIDEr evaluation
score, cider_scores = evaluate_cider(hypos, refs_dict, PICKLE_PATH)


print("✅ Batch evaluation complete!")
return all_results,cosine_scores, Spice_scores, Inference_time, Vram_usages
    return all_results, cosine_scores, cider_scores, Spice_scores, Inference_time, Vram_usages
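For orientation, a sketch of how a caller might pair the six return values back up per sample (shapes inferred from the code above; `val_data`, the prompt, and the indexes are placeholders): the dict outputs are keyed by dataset index, while the CIDEr array is positional in the order of `indexes`.

```python
# Hypothetical caller; val_data is assumed to be a dataset with 'image' and 'caption' columns.
indexes = [12, 47, 93]
results, cos, cider, spice, times, vram = evaluate_batch(
    "Describe the damage in the image.", val_data, indexes, multiple_refs=True,
    MODEL_DIR="/workspace/unsloth-finetune", PICKLE_PATH="/workspace/cardd-df.p",
)
for pos, idx in enumerate(indexes):
    print(idx, results[idx], cos[idx], cider[pos], spice[idx], times[idx], vram[idx])
```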