diff --git a/experiments/prompt-tuning/hc+ft/cardd/Inference.py b/experiments/prompt-tuning/hc+ft/cardd/Inference.py
index 422c3e2..8710be1 100644
--- a/experiments/prompt-tuning/hc+ft/cardd/Inference.py
+++ b/experiments/prompt-tuning/hc+ft/cardd/Inference.py
@@ -33,8 +33,9 @@ def log_metrics_to_excel(
     inference_times: List[float],
     vram_usage: List[float],
     cosine_scores: List[float],
+    cider_scores: List[float],
     spice_scores: List[float],
-    flickr_subset,
+    test_subset,
     output_excel_path: str = "Flickr_pixtral.xlsx",
     prompts: str = None,
     wandb_project: str = "flickr-eval"
@@ -43,14 +44,16 @@ def log_metrics_to_excel(
     pil_images = []  # keep images for wandb and excel insertion
     n = len(samples)
     for i, s in enumerate(samples):
-        pred = results[i] if i < len(results) else None
-        time_taken = inference_times[i] if i < len(inference_times) else None
-        vram = vram_usage[i] if i < len(vram_usage) else None
-        cos = cosine_scores[i] if i < len(cosine_scores) else None
-        spice = spice_scores[i] if i < len(spice_scores) else None
+        #pred = results[i] if i < len(results) else None
+        pred = results.get(s, None)
+        time_taken = inference_times.get(s, None)
+        vram = vram_usage.get(s, None)
+        cos = cosine_scores.get(s, None)
+        spice = spice_scores.get(s, None)
+        cider = cider_scores[i] if i < len(cider_scores) else None  # cider_scores is positional, aligned with the order of samples
 
         # sample lookup
-        sample_item = flickr_subset[s]
+        sample_item = test_subset[s]
         pil_img = sample_item['image']
         if not isinstance(pil_img, PILImage.Image):
             pil_img = PILImage.fromarray(pil_img)
@@ -66,6 +69,7 @@ def log_metrics_to_excel(
             "prediction": pred,
             "cosine_score": cos,
             "spice_score": spice,
+            "cider_score": cider,
             "vram_usage": vram,
             "inference_time_s": time_taken,
         }
@@ -97,7 +101,7 @@ def log_metrics_to_excel(
     except Exception as e:
         print(f"[warning] wandb.init failed: {e}. Skipping wandb logging.")
         return df, None
-    table_cols = ["sample_index", "image", "prediction", "cosine_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
+    table_cols = ["sample_index", "image", "prediction", "cosine_score", "cider_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
     wandb_table = wandb.Table(columns=table_cols)
     for i, row in enumerate(rows):
         pil_img = pil_images[i]
@@ -112,6 +116,7 @@ def log_metrics_to_excel(
             wb_image,
             row["prediction"],
             row["cosine_score"],
+            row["cider_score"],
             row["spice_score"],
             row["vram_usage"],
             row["inference_time_s"],
@@ -134,6 +139,8 @@ def log_metrics_to_excel(
 parser = argparse.ArgumentParser(description="Run inference on a vision-language model")
 parser.add_argument("--prompt", type=str, default="Explain the image content step by step.", help="Prompt for the model")
 parser.add_argument("--model-name", type=str, default="Pixtral-12B", help="Model name to evaluate (e.g. Pixtral-12B)")
+parser.add_argument("--base-model", type=str, default="unsloth/Qwen2-VL-7B-Instruct", help="Base model name (Hugging Face repo)")
+parser.add_argument("--pickle-path", type=str, default="/workspace/cardd-df.p", help="Path to the pickle file for CIDEr evaluation")
 parser.add_argument("--dataset-folder", type=str, default="/workspace/filtered_dataset", help="Fallback image folder (if dataset items are paths)")
 parser.add_argument("--wandb-project", type=str, default="flickr-eval", help="WandB project name")
 parser.add_argument("--output-excel", type=str, default="Flickr_pixtral.xlsx", help="Output Excel file path")
@@ -165,21 +172,23 @@ def log_metrics_to_excel(
 
 print("🔄 Loading Flickr subset dataset...")
 try:
-    flickr_subset = load_from_disk(dataset_folder)
-    print("✅ Dataset loaded. Number of samples:", len(flickr_subset))
+    test_subset = load_from_disk(dataset_folder)
+    print("✅ Dataset loaded. Number of samples:", len(test_subset))
 except Exception as e:
     print(f"[warning] Could not load dataset via load_from_disk({dataset_folder}): {e}")
     # fallback: try to treat dataset_folder as a directory of images
-    flickr_subset = []
-    print("⚠️ flickr_subset is empty; images will be looked up from --img-folder by index when possible")
+    test_subset = []
+    print("⚠️ test_subset is empty; images will be looked up from --img-folder by index when possible")
 
 print(f"🚀 Running evaluation batch with model {model_name}...")
-results, cosine_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
+results, cosine_scores, cider_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
     prompt,
-    flickr_subset,
+    test_subset,
     samples,
     multiple_refs,
     MODEL_DIR=model_dir,
+    BASE_MODEL=args.base_model,
+    PICKLE_PATH=args.pickle_path,
     LOAD_FROM_HF=args.load_from_hf
 )
 
@@ -187,6 +196,7 @@ def log_metrics_to_excel(
 print("✅ Evaluation complete!")
 print("📊 Results summary:")
 print("Cosine scores:", cosine_scores)
+print("CIDEr scores:", cider_scores)
 print("SPICE scores:", spice_scores)
 print("Inference times:", inference_times)
 print("VRAM usage:", vram_usage)
@@ -200,8 +210,9 @@ def log_metrics_to_excel(
     inference_times,
     vram_usage,
     cosine_scores,
+    cider_scores,
     spice_scores,
-    flickr_subset,
+    test_subset,
     output_excel_path=excel_path,
     prompts=prompt,
     wandb_project=wandb_project
diff --git a/experiments/prompt-tuning/hc+ft/cardd/cardd-df.p b/experiments/prompt-tuning/hc+ft/cardd/cardd-df.p
new file mode 100644
index 0000000..a9a1160
Binary files /dev/null and b/experiments/prompt-tuning/hc+ft/cardd/cardd-df.p differ
diff --git a/experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh b/experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh
index 5e63a44..50ced63 100644
--- a/experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh
+++ b/experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh
@@ -14,7 +14,7 @@ RUN_SCRIPT="Inference.py"
 WANDB_PROJECT="cardd-eval"
 MODEL_NAME="unsloth/Qwen2-VL-7B-Instruct"
 SAMPLE_FOLDER="kaggle/working/cardd_sample_hf/train"
-USE_HF_DOWNLOAD=false
+USE_HF_DOWNLOAD=true
 HF_TOKEN=""  # add your huggingface token here
 REPO_ID=""   # add your huggingface repo id here
 
@@ -63,7 +63,7 @@ for i in "${!PROMPTS[@]}"; do
       --wandb-project "$WANDB_PROJECT" \
       --output-excel "$OUTPUT_XLS" \
       --model-dir "$MODEL_DIR" \
-      #--load-from-hf #remove this flag if not loading from HF
+      --load-from-hf  # remove this flag if not loading from HF
     echo "✅ Done. Excel saved at: $OUTPUT_XLS"
   else
     echo "❗ $RUN_SCRIPT not found in cwd. If you don't have it, run your own eval script and pass --model-name or --model-path as $MODEL_ROOT"
diff --git a/experiments/prompt-tuning/hc+ft/cardd/cider.py b/experiments/prompt-tuning/hc+ft/cardd/cider.py
new file mode 100644
index 0000000..e0fbada
--- /dev/null
+++ b/experiments/prompt-tuning/hc+ft/cardd/cider.py
@@ -0,0 +1,158 @@
+# Tsung-Yi Lin
+# Ramakrishna Vedantam
+
+import copy
+import pickle
+from collections import defaultdict
+import numpy as np
+import math
+import os
+
+def precook(s, n=4, out=False):
+    words = s.split()
+    counts = defaultdict(int)
+    for k in range(1, n + 1):
+        for i in range(len(words) - k + 1):
+            ngram = tuple(words[i:i + k])
+            counts[ngram] += 1
+    return counts
+
+def cook_refs(refs, n=4):
+    return [precook(ref, n) for ref in refs]
+
+def cook_test(test, n=4):
+    return precook(test, n, True)
+
+class CiderScorer(object):
+    def copy(self):
+        new = CiderScorer(n=self.n)
+        new.ctest = copy.copy(self.ctest)
+        new.crefs = copy.copy(self.crefs)
+        return new
+
+    def __init__(self, test=None, refs=None, n=4, sigma=6.0):
+        self.n = n
+        self.sigma = sigma
+        self.crefs = []
+        self.ctest = []
+        self.document_frequency = defaultdict(float)
+        self.cook_append(test, refs)
+        self.ref_len = None
+
+    def cook_append(self, test, refs):
+        if refs is not None:
+            self.crefs.append(cook_refs(refs))
+            if test is not None:
+                self.ctest.append(cook_test(test))
+            else:
+                self.ctest.append(None)
+
+    def size(self):
+        assert len(self.crefs) == len(self.ctest), f"refs/test mismatch! {len(self.crefs)}<>{len(self.ctest)}"
+        return len(self.crefs)
+
+    def __iadd__(self, other):
+        if isinstance(other, tuple):
+            self.cook_append(other[0], other[1])
+        else:
+            self.ctest.extend(other.ctest)
+            self.crefs.extend(other.crefs)
+        return self
+
+    def compute_doc_freq(self):
+        for refs in self.crefs:
+            for ngram in set([ngram for ref in refs for (ngram, _) in ref.items()]):
+                self.document_frequency[ngram] += 1
+
+    def compute_cider(self):
+        def counts2vec(cnts):
+            vec = [defaultdict(float) for _ in range(self.n)]
+            length = 0
+            norm = [0.0 for _ in range(self.n)]
+            for (ngram, term_freq) in cnts.items():
+                df = np.log(max(1.0, self.document_frequency.get(ngram, 0.0)))
+                n = len(ngram) - 1
+                if n >= self.n:
+                    continue
+                vec[n][ngram] = float(term_freq) * (self.ref_len - df)
+                norm[n] += pow(vec[n][ngram], 2)
+                if n == 1:
+                    length += term_freq
+            norm = [np.sqrt(n) for n in norm]
+            return vec, norm, length
+
+        def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
+            val = np.array([0.0 for _ in range(self.n)])
+            for n in range(self.n):
+                for (ngram, _) in vec_hyp[n].items():
+                    val[n] += vec_hyp[n][ngram] * vec_ref[n].get(ngram, 0.0)
+                if norm_hyp[n] != 0 and norm_ref[n] != 0:
+                    val[n] /= (norm_hyp[n] * norm_ref[n])
+                assert not math.isnan(val[n])
+            return val
+
+        self.ref_len = np.log(float(40504))
+
+        scores = []
+        for test, refs in zip(self.ctest, self.crefs):
+            vec, norm, length = counts2vec(test)
+            score = np.array([0.0 for _ in range(self.n)])
+            for ref in refs:
+                vec_ref, norm_ref, length_ref = counts2vec(ref)
+                score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
+            score_avg = np.mean(score)
+            score_avg /= len(refs)
+            score_avg *= 10.0
+            scores.append(score_avg)
+        return scores
+
+    def compute_score(self, df_mode, pfile_path, option=None, verbose=0):
+        with open(pfile_path, 'rb') as f:
+            self.document_frequency = pickle.load(f)
+        score = self.compute_cider()
+        return np.mean(np.array(score)), np.array(score)
+class Cider:
+    """
+    Main Class to compute the CIDEr metric
+
+    """
+    def __init__(self, n=4, df="coco-val-df"):
+        """
+        Initialize the CIDEr scoring function
+        : param n (int): n-gram size
+        : param df (string): specifies where to get the IDF values from
+                             takes values 'corpus', 'coco-train'
+        : return: None
+        """
+        # set cider to sum over 1 to 4-grams
+        self._n = n
+        self._df = df
+
+    def compute_score(self, gts, res, pfile_path):
+        """
+        Main function to compute CIDEr score
+        : param gts (dict) : {image:tokenized reference sentence}
+        : param res (dict) : {image:tokenized candidate sentence}
+        : return: cider (float) : computed CIDEr score for the corpus
+        """
+
+        cider_scorer = CiderScorer(n=self._n)
+
+        for res_id in res:
+
+            hypo = res_id['caption']
+            ref = gts[res_id['image_id']]
+
+            # Sanity check.
+            assert(type(hypo) is list)
+            assert(len(hypo) == 1)
+            assert(type(ref) is list)
+            assert(len(ref) > 0)
+            cider_scorer += (hypo[0], ref)
+
+        (score, scores) = cider_scorer.compute_score(self._df, pfile_path=pfile_path)
+
+        return score, scores
+
+    def method(self):
+        return "CIDEr"
\ No newline at end of file
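Note on the new cardd-df.p artifact: CiderScorer.compute_score above loads a pickled n-gram document-frequency table instead of computing IDF from the evaluation corpus, and compute_cider hard-codes ref_len = log(40504) (the COCO val2014 image count used by the reference implementation). As a rough illustration of how such a pickle could be regenerated for CarDD, here is a minimal sketch that reuses CiderScorer.compute_doc_freq; the helper name build_df_pickle and the assumed reference_lists input format are illustrative, not part of this patch.

```python
# Hypothetical helper, not part of this patch: builds a document-frequency
# pickle compatible with CiderScorer.compute_score(pfile_path=...).
import pickle

from cider import CiderScorer

def build_df_pickle(reference_lists, out_path="cardd-df.p"):
    # reference_lists: iterable where each element is the list of reference
    # captions for one image (assumed input format).
    scorer = CiderScorer(n=4)
    for refs in reference_lists:
        scorer += (None, refs)   # only references are needed for document frequencies
    scorer.compute_doc_freq()    # counts, per n-gram, how many images contain it
    with open(out_path, "wb") as f:
        pickle.dump(scorer.document_frequency, f)
```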
diff --git a/experiments/prompt-tuning/hc+ft/cardd/utils.py b/experiments/prompt-tuning/hc+ft/cardd/utils.py
index c5629bf..d93e106 100644
--- a/experiments/prompt-tuning/hc+ft/cardd/utils.py
+++ b/experiments/prompt-tuning/hc+ft/cardd/utils.py
@@ -6,11 +6,13 @@
 from pycocoevalcap.spice.spice import Spice
 from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 import pandas as pd
+from cider import Cider
 import time
 import torch
 import os
 
-def get_similarity_score(reference_captions, generated_caption):
+
+def get_similarity_score(reference_captions, generated_caption, scorer):
     try:
         total_score = 0.0
         for caption in reference_captions:
@@ -25,6 +27,19 @@
     except Exception as e:
         return 0.0
 
+def evaluate_cider(hypos, refs, PICKLE_PATH):
+    gts = {str(i): refs[i] for i in refs}
+
+    res = [{"image_id": str(i), "caption": hypos[i]} for i in hypos]
+
+    # Evaluate
+    cider = Cider()
+    score, individual_scores = cider.compute_score(gts, res, PICKLE_PATH)
+    print(f"🎯 CIDEr score: {score:.4f}")
+
+    return score, individual_scores
+
+
 def calculate_spice(gts, res, stanford_corenlp_home=None):
     """
     Calculates SPICE score.
@@ -121,14 +136,14 @@ def run_inference(image, model, tokenizer, instruction):
         # On error, return empty caption and zeros
         return "", 0.0, 0.0
 
-def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", LOAD_FROM_HF=False):
+def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", BASE_MODEL="unsloth/Qwen2-VL-7B-Instruct", PICKLE_PATH="/workspace/cardd-df.p", LOAD_FROM_HF=False):
     """
     prompts_list: list of instructions to evaluate
     val_data: DataFrame with ['image', 'caption'] columns,
     indexes: list of indexes to sample from val_data
     """
-    print(f"🔄 Loading vision-language model from {MODEL_DIR}...")
-    BASE_MODEL = "unsloth/Qwen2-VL-7B-Instruct"
+    print(f"🔄 Loading vision-language model from {MODEL_DIR}...")
+
     # --- Load model ---
     if LOAD_FROM_HF:
         print(f"🔄 Loading base model '{BASE_MODEL}'...")
@@ -168,12 +183,12 @@ def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/wo
         if multiple_refs:
             reference_list = sample['caption']
             pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
-            cos_score = get_similarity_score(reference_list, pred)
+            cos_score = get_similarity_score(reference_list, pred, scorer)
         else:
             reference_list = [sample['caption']]
             pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
-            cos_score = get_similarity_score(reference_list, pred)
+            cos_score = get_similarity_score(reference_list, pred, scorer)
 
         all_results[index] = pred
         cosine_scores[index] = cos_score
@@ -181,14 +196,25 @@ def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/wo
         Vram_usages[index] = peak_vram
 
     gts = {}
     res = {}
-    for i in range(len(indexes)):
-        gts[str(i)] = [{"caption": ref} for ref in reference_list]
-        res[str(i)] = [{"caption": all_results[indexes[i]]}]
+    for j, idx in enumerate(indexes):
+        sample = val_data[idx]
+        refs = sample['caption'] if multiple_refs else [sample['caption']]
+        gts[str(j)] = [{"caption": ref} for ref in refs]
+        res[str(j)] = [{"caption": all_results[idx]}]
 
     spice_score, spice_scores_per_instance = calculate_spice(gts, res)
     for i, idx in enumerate(indexes):
         Spice_scores[idx] = spice_scores_per_instance[i] if spice_scores_per_instance else 0.0
-
+    # Build dicts for CIDEr (keys follow the order of indexes)
+    hypos = {j: [all_results[idx]] for j, idx in enumerate(indexes)}  # index → [generated caption]
+    refs_dict = {j: sample['caption'] if multiple_refs else [sample['caption']]
+                 for j, idx in enumerate(indexes)
+                 for sample in [val_data[idx]]}  # index → list of reference strings
+
+    # Call CIDEr evaluation
+    score, cider_scores = evaluate_cider(hypos, refs_dict, PICKLE_PATH)
+
+    print("✅ Batch evaluation complete!")
-    return all_results,cosine_scores, Spice_scores, Inference_time, Vram_usages
+    return all_results, cosine_scores, cider_scores, Spice_scores, Inference_time, Vram_usages
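For reference, a minimal sketch of how the new evaluate_cider helper is meant to be called; it mirrors the dict shapes evaluate_batch builds internally. The caption strings are made up, and the pickle must already exist at the default --pickle-path.

```python
# Toy example (illustrative captions); assumes /workspace/cardd-df.p exists,
# which is the default --pickle-path added in this patch.
from utils import evaluate_cider

hypos = {0: ["a dent on the front bumper"],             # position -> [generated caption]
         1: ["a scratch along the rear door"]}
refs = {0: ["the front bumper has a large dent", "dented front bumper"],
        1: ["a long scratch on the rear door panel"]}   # position -> reference captions

corpus_score, per_image = evaluate_cider(hypos, refs, "/workspace/cardd-df.p")
# per_image is a numpy array ordered like the keys of hypos, which is why
# Inference.py indexes cider_scores positionally rather than by dataset index.
print(corpus_score, per_image)
```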