41 changes: 26 additions & 15 deletions experiments/prompt-tuning/hc+ft/cardd/Inference.py
@@ -33,8 +33,9 @@ def log_metrics_to_excel(
inference_times: List[float],
vram_usage: List[float],
cosine_scores: List[float],
cider_scores: List[float],
spice_scores: List[float],
flickr_subset,
test_subset,
output_excel_path: str = "Flickr_pixtral.xlsx",
prompts: str = None,
wandb_project: str = "flickr-eval"
@@ -43,14 +44,16 @@ def log_metrics_to_excel(
pil_images = [] # keep images for wandb and excel insertion
n = len(samples)
for i, s in enumerate(samples):
pred = results[i] if i < len(results) else None
time_taken = inference_times[i] if i < len(inference_times) else None
vram = vram_usage[i] if i < len(vram_usage) else None
cos = cosine_scores[i] if i < len(cosine_scores) else None
spice = spice_scores[i] if i < len(spice_scores) else None
#pred = results[i] if i < len(results) else None
pred = results.get(s, None)
time_taken = inference_times.get(s, None)
vram = vram_usage.get(s, None)
cos = cosine_scores.get(s, None)
spice = spice_scores.get(s, None)
        cider = cider_scores[i] if i < len(cider_scores) else None  # CIDEr scores are positional: one per evaluated sample, in order

# sample lookup
sample_item = flickr_subset[s]
sample_item = test_subset[s]
pil_img = sample_item['image']
if not isinstance(pil_img, PILImage.Image):
pil_img = PILImage.fromarray(pil_img)
@@ -66,6 +69,7 @@ def log_metrics_to_excel(
"prediction": pred,
"cosine_score": cos,
"spice_score": spice,
"cider_score": cider,
"vram_usage": vram,
"inference_time_s": time_taken,
}
@@ -97,7 +101,7 @@ def log_metrics_to_excel(
except Exception as e:
print(f"[warning] wandb.init failed: {e}. Skipping wandb logging.")
return df, None
table_cols = ["sample_index", "image", "prediction", "cosine_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
table_cols = ["sample_index", "image", "prediction", "cosine_score", "cider_score", "spice_score", "vram_usage", "inference_time_s", "prompt"]
wandb_table = wandb.Table(columns=table_cols)
for i, row in enumerate(rows):
pil_img = pil_images[i]
@@ -112,6 +116,7 @@ def log_metrics_to_excel(
wb_image,
row["prediction"],
row["cosine_score"],
row["cider_score"],
row["spice_score"],
row["vram_usage"],
row["inference_time_s"],
@@ -134,6 +139,8 @@ def log_metrics_to_excel(
parser = argparse.ArgumentParser(description="Run inference on a vision-language model")
parser.add_argument("--prompt", type=str, default="Explain the image content step by step.", help="Prompt for the model")
parser.add_argument("--model-name", type=str, default="Pixtral-12B", help="Model name to evaluate (e.g. Pixtral-12B)")
parser.add_argument("--base-model", type=str, default="unsloth/Qwen2-VL-7B-Instruct", help="Base model name (Hugging Face repo)")
parser.add_argument("--pickle-path", type=str, default="/workspace/cardd-df.p", help="Path to the pickle file for CIDEr evaluation")
parser.add_argument("--dataset-folder", type=str, default="/workspace/filtered_dataset", help="Fallback image folder (if dataset items are paths)")
parser.add_argument("--wandb-project", type=str, default="flickr-eval", help="WandB project name")
parser.add_argument("--output-excel", type=str, default="Flickr_pixtral.xlsx", help="Output Excel file path")
@@ -165,28 +172,31 @@ def log_metrics_to_excel(

print("🔄 Loading Flickr subset dataset...")
try:
flickr_subset = load_from_disk(dataset_folder)
print("✅ Dataset loaded. Number of samples:", len(flickr_subset))
test_subset = load_from_disk(dataset_folder)
print("✅ Dataset loaded. Number of samples:", len(test_subset))
except Exception as e:
print(f"[warning] Could not load dataset via load_from_disk({dataset_folder}): {e}")
# fallback: try to treat dataset_folder as a directory of images
flickr_subset = []
print("⚠️ flickr_subset is empty; images will be looked up from --img-folder by index when possible")
test_subset = []
print("⚠️ test_subset is empty; images will be looked up from --img-folder by index when possible")

print(f"🚀 Running evaluation batch with model {model_name}...")
results, cosine_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
results, cosine_scores, cider_scores, spice_scores, inference_times, vram_usage = evaluate_batch(
prompt,
flickr_subset,
test_subset,
samples,
multiple_refs,
MODEL_DIR=model_dir,
        BASE_MODEL=args.base_model,
        PICKLE_PATH=args.pickle_path,
LOAD_FROM_HF=args.load_from_hf
)


print("✅ Evaluation complete!")
print("📊 Results summary:")
print("Cosine scores:", cosine_scores)
print("CIDEr scores:", cider_scores)
print("SPICE scores:", spice_scores)
print("Inference times:", inference_times)
print("VRAM usage:", vram_usage)
@@ -200,8 +210,9 @@ def log_metrics_to_excel(
inference_times,
vram_usage,
cosine_scores,
cider_scores,
spice_scores,
flickr_subset,
test_subset,
output_excel_path=excel_path,
prompts=prompt,
wandb_project=wandb_project
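Note on the containers reaching `log_metrics_to_excel` after this change (a sketch of assumed shapes, inferred from `evaluate_batch` in `utils.py`; the indexes and values below are invented): predictions, cosine/SPICE scores, timings, and VRAM figures arrive as dicts keyed by dataset index, while the CIDEr scores arrive as a positional array aligned with `samples`.

```python
# Illustrative shapes only -- sample indexes and scores are made up.
samples = [12, 47, 93]                           # dataset indexes that were evaluated
results = {12: "a dented front door", 47: "a cracked headlight", 93: "a scratched bumper"}
cosine_scores = {12: 0.81, 47: 0.77, 93: 0.69}   # keyed by dataset index
cider_scores = [1.42, 0.95, 1.10]                # positional: cider_scores[i] pairs with samples[i]

for i, s in enumerate(samples):
    pred = results.get(s)
    cider = cider_scores[i] if i < len(cider_scores) else None
```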
Binary file added experiments/prompt-tuning/hc+ft/cardd/cardd-df.p
Binary file not shown.
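The committed `cardd-df.p` binary is the pickled document-frequency table that `cider.py` unpickles in `CiderScorer.compute_score`. The PR does not show how it was generated; a plausible sketch (an assumption — `build_df_pickle` and the example captions are invented for illustration) would accumulate the reference captions with `CiderScorer` and pickle its `document_frequency` mapping:

```python
# Hypothetical sketch: building a document-frequency pickle such as cardd-df.p.
import pickle
from cider import CiderScorer

def build_df_pickle(reference_lists, out_path="cardd-df.p", n=4):
    """reference_lists: one list of reference caption strings per image."""
    scorer = CiderScorer(n=n)
    for refs in reference_lists:
        scorer.cook_append(None, refs)             # only references are needed for document frequencies
    scorer.compute_doc_freq()
    with open(out_path, "wb") as f:
        pickle.dump(scorer.document_frequency, f)  # compute_score() loads exactly this mapping

# build_df_pickle([["a dent on the front door", "the driver door is dented"], ...])
```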
4 changes: 2 additions & 2 deletions experiments/prompt-tuning/hc+ft/cardd/cardd_qwen.sh
@@ -14,7 +14,7 @@ RUN_SCRIPT="Inference.py"
WANDB_PROJECT="cardd-eval"
MODEL_NAME="unsloth/Qwen2-VL-7B-Instruct"
SAMPLE_FOLDER="kaggle/working/cardd_sample_hf/train"
USE_HF_DOWNLOAD=false
USE_HF_DOWNLOAD=true

HF_TOKEN="" # add your huggingface token here
REPO_ID="" # add your huggingface repo id here
@@ -63,7 +63,7 @@ for i in "${!PROMPTS[@]}"; do
--wandb-project "$WANDB_PROJECT" \
--output-excel "$OUTPUT_XLS" \
--model-dir "$MODEL_DIR" \
#--load-from-hf #remove this flag if not loading from HF
--load-from-hf #remove this flag if not loading from HF
echo "✅ Done. Excel saved at: $OUTPUT_XLS"
else
echo "❗ $RUN_SCRIPT not found in cwd. If you don't have it, run your own eval script and pass --model-name or --model-path as $MODEL_ROOT"
158 changes: 158 additions & 0 deletions experiments/prompt-tuning/hc+ft/cardd/cider.py
@@ -0,0 +1,158 @@
# Tsung-Yi Lin <[email protected]>
# Ramakrishna Vedantam <[email protected]>

import copy
import pickle
from collections import defaultdict
import numpy as np
import math
import os

def precook(s, n=4, out=False):
    """Count all n-grams of length 1..n in a whitespace-tokenized sentence."""
words = s.split()
counts = defaultdict(int)
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i:i + k])
counts[ngram] += 1
return counts

def cook_refs(refs, n=4):
return [precook(ref, n) for ref in refs]

def cook_test(test, n=4):
return precook(test, n, True)

class CiderScorer(object):
def copy(self):
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new

def __init__(self, test=None, refs=None, n=4, sigma=6.0):
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.cook_append(test, refs)
self.ref_len = None

def cook_append(self, test, refs):
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test))
else:
self.ctest.append(None)

def size(self):
assert len(self.crefs) == len(self.ctest), f"refs/test mismatch! {len(self.crefs)}<>{len(self.ctest)}"
return len(self.crefs)

def __iadd__(self, other):
if isinstance(other, tuple):
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self

    def compute_doc_freq(self):
        # Document frequency: for each n-gram, count how many images' reference sets contain it.
for refs in self.crefs:
for ngram in set([ngram for ref in refs for (ngram, _) in ref.items()]):
self.document_frequency[ngram] += 1

def compute_cider(self):
def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram, term_freq) in cnts.items():
df = np.log(max(1.0, self.document_frequency.get(ngram, 0.0)))
n = len(ngram) - 1
if n >= self.n:
continue
vec[n][ngram] = float(term_freq) * (self.ref_len - df)
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length

def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
for (ngram, _) in vec_hyp[n].items():
val[n] += vec_hyp[n][ngram] * vec_ref[n].get(ngram, 0.0)
if norm_hyp[n] != 0 and norm_ref[n] != 0:
val[n] /= (norm_hyp[n] * norm_ref[n])
assert not math.isnan(val[n])
return val

        self.ref_len = np.log(float(40504))  # log of the COCO val2014 corpus size; assumes the DF pickle was built over a comparable corpus

scores = []
for test, refs in zip(self.ctest, self.crefs):
vec, norm, length = counts2vec(test)
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
score_avg = np.mean(score)
score_avg /= len(refs)
score_avg *= 10.0
scores.append(score_avg)
return scores

def compute_score(self, df_mode, pfile_path, option=None, verbose=0):
with open(pfile_path, 'rb') as f:
self.document_frequency = pickle.load(f)
score = self.compute_cider()
return np.mean(np.array(score)), np.array(score)
class Cider:
"""
Main Class to compute the CIDEr metric

"""
def __init__(self, n=4, df="coco-val-df"):
"""
Initialize the CIDEr scoring function
        :param n (int): n-gram size
        :param df (string): label for the IDF source (kept for compatibility; the
            actual document frequencies are loaded from the pickle passed to compute_score)
        :return: None
"""
# set cider to sum over 1 to 4-grams
self._n = n
self._df = df

def compute_score(self, gts, res, pfile_path):
"""
Main function to compute CIDEr score
        :param gts (dict): {image_id: list of tokenized reference sentences}
        :param res (list): [{"image_id": ..., "caption": [tokenized candidate sentence]}, ...]
        :param pfile_path (str): path to the pickled document-frequency table
        :return: (corpus-level CIDEr score (float), per-image scores (np.ndarray))
"""

cider_scorer = CiderScorer(n=self._n)

for res_id in res:

hypo = res_id['caption']
ref = gts[res_id['image_id']]

# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) > 0)
cider_scorer += (hypo[0], ref)

        (score, scores) = cider_scorer.compute_score(self._df, pfile_path=pfile_path)

return score, scores

def method(self):
return "CIDEr"
48 changes: 37 additions & 11 deletions experiments/prompt-tuning/hc+ft/cardd/utils.py
@@ -6,11 +6,13 @@
from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
import pandas as pd
from cider import Cider
import time
import torch
import os

def get_similarity_score(reference_captions, generated_caption):

def get_similarity_score(reference_captions, generated_caption, scorer):
try:
total_score = 0.0
for caption in reference_captions:
@@ -25,6 +27,19 @@ def get_similarity_score(reference_captions, generated_caption):
except Exception as e:
return 0.0

def evaluate_cider(hypos, refs, PICKLE_PATH):
gts = {str(i): refs[i] for i in refs}

res = [{"image_id": str(i), "caption": hypos[i]} for i in hypos]

# Evaluate
cider = Cider()
score, individual_scores = cider.compute_score(gts, res, PICKLE_PATH)
print(f"🎯 CIDEr score: {score:.4f}")

return score, individual_scores


def calculate_spice(gts, res, stanford_corenlp_home=None):
"""
Calculates SPICE score.
@@ -121,14 +136,14 @@ def run_inference(image, model, tokenizer, instruction):
# On error, return empty caption and zeros
return "", 0.0, 0.0

def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", LOAD_FROM_HF=False):
def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/workspace/unsloth-finetune", BASE_MODEL="unsloth/Qwen2-VL-7B-Instruct", PICKLE_PATH="/workspace/cardd-df.p", LOAD_FROM_HF=False):
"""
prompts_list: list of instructions to evaluate
val_data: DataFrame with ['image', 'caption'] columns,
indexes: list of indexes to sample from val_data
"""
print(f"🔄 Loading vision-language model from {MODEL_DIR}...")
BASE_MODEL = "unsloth/Qwen2-VL-7B-Instruct"
print(f"🔄 Loading vision-language model from {MODEL_DIR}...")

# --- Load model ---
if LOAD_FROM_HF:
print(f"🔄 Loading base model '{BASE_MODEL}'...")
@@ -168,27 +183,38 @@ def evaluate_batch(prompt, val_data, indexes, multiple_refs=True, MODEL_DIR="/wo
if multiple_refs:
reference_list = sample['caption']
pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
cos_score = get_similarity_score(reference_list, pred)
            cos_score = get_similarity_score(reference_list, pred, scorer)

else:
reference_list = [sample['caption']]
pred, inference_time, peak_vram = run_inference(sample['image'], model, tokenizer, prompt)
cos_score = get_similarity_score(reference_list, pred)
            cos_score = get_similarity_score(reference_list, pred, scorer)

all_results[index] = pred
cosine_scores[index] = cos_score
Inference_time[index] = inference_time
Vram_usages[index] = peak_vram
gts = {}
res = {}
for i in range(len(indexes)):
gts[str(i)] = [{"caption": ref} for ref in reference_list]
res[str(i)] = [{"caption": all_results[indexes[i]]}]
for j, idx in enumerate(indexes):
sample = val_data[idx]
refs = sample['caption'] if multiple_refs else [sample['caption']]
gts[str(j)] = [{"caption": ref} for ref in refs]
res[str(j)] = [{"caption": all_results[idx]}]
spice_score, spice_scores_per_instance = calculate_spice(gts, res)
for i, idx in enumerate(indexes):
Spice_scores[idx] = spice_scores_per_instance[i] if spice_scores_per_instance else 0.0


    # Build dicts for CIDEr, keyed by position j (same order as `indexes`)
    hypos = {j: [all_results[idx]] for j, idx in enumerate(indexes)}  # position -> [prediction string]
    refs_dict = {
        j: (val_data[idx]['caption'] if multiple_refs else [val_data[idx]['caption']])
        for j, idx in enumerate(indexes)
    }  # position -> list of reference strings

# Call CIDEr evaluation
score, cider_scores = evaluate_cider(hypos, refs_dict, PICKLE_PATH)


print("✅ Batch evaluation complete!")
return all_results,cosine_scores, Spice_scores, Inference_time, Vram_usages
    return all_results, cosine_scores, cider_scores, Spice_scores, Inference_time, Vram_usages
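For orientation, a sketch of how a caller might pair the six return values back up per sample (shapes inferred from the code above; `val_data`, the prompt, and the indexes are placeholders): the dict outputs are keyed by dataset index, while the CIDEr array is positional in the order of `indexes`.

```python
# Hypothetical caller; val_data is assumed to be a dataset with 'image' and 'caption' columns.
indexes = [12, 47, 93]
results, cos, cider, spice, times, vram = evaluate_batch(
    "Describe the damage in the image.", val_data, indexes, multiple_refs=True,
    MODEL_DIR="/workspace/unsloth-finetune", PICKLE_PATH="/workspace/cardd-df.p",
)
for pos, idx in enumerate(indexes):
    print(idx, results[idx], cos[idx], cider[pos], spice[idx], times[idx], vram[idx])
```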