
Commit 8a06e94

Initial commit
1 parent c121f04 commit 8a06e94

21 files changed: +2346 -62 lines changed

Diff for: chair.pkl (1.96 MB)

Binary file not shown.

Diff for: llava/eval/chair.py (+474)

Large diffs are not rendered by default.

Diff for: llava/eval/dense_image_caption.py (+92, new file)

import torch
import os
import json
from PIL import Image
from llava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images


def eval_single_image(model_path, image_path, question, conv_mode="llava_v1", temperature=0.2, top_p=None, num_beams=1, max_new_tokens=128):
    # Disable redundant initialization to speed up model loading
    disable_torch_init()

    # Load the model
    print("Loading model and tokenizer...")
    model_name = os.path.basename(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path, None, model_name
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Build the prompt from the question and the conversation template
    print("Preparing prompt...")
    if hasattr(model.config, "mm_use_im_start_end") and model.config.mm_use_im_start_end:
        prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
    else:
        prompt = DEFAULT_IMAGE_TOKEN + '\n' + question

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    final_prompt = conv.get_prompt()

    # Preprocess the image
    print("Processing image...")
    image = Image.open(image_path).convert('RGB')
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Tokenize the input; unsqueeze(0) adds the batch dimension generate() expects
    input_ids = tokenizer_image_token(
        final_prompt, tokenizer, return_tensors="pt"
    ).unsqueeze(0)
    input_ids = input_ids.to(device)
    image_tensor = image_tensor.to(dtype=torch.float16, device=device)

    # Run inference
    print("Running inference...")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0),  # add batch dimension
            do_sample=temperature > 0,  # without this, temperature/top_p are silently ignored
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        )

    # Decode the generated answer
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    return answer


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True, help="Path to the pre-trained model.")
    parser.add_argument("--image-path", type=str, required=True, help="Path to the input image.")
    parser.add_argument("--question", type=str, required=True, help="Question to ask about the image.")
    parser.add_argument("--conv-mode", type=str, default="llava_v1", help="Conversation mode for the model.")
    parser.add_argument("--temperature", type=float, default=0.2, help="Sampling temperature.")
    parser.add_argument("--top_p", type=float, default=None, help="Nucleus sampling probability.")
    parser.add_argument("--num_beams", type=int, default=1, help="Number of beams for beam search.")
    parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of tokens to generate.")
    args = parser.parse_args()

    # Run inference
    result = eval_single_image(
        model_path=args.model_path,
        image_path=args.image_path,
        question=args.question,
        conv_mode=args.conv_mode,
        temperature=args.temperature,
        top_p=args.top_p,
        num_beams=args.num_beams,
        max_new_tokens=args.max_new_tokens,
    )
    print("\nGenerated Answer:", result)

Diff for: llava/eval/eval_pope.py (+5 -3)

@@ -6,15 +6,15 @@ def eval_pope(answers, label_file):
     label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
 
     for answer in answers:
-        text = answer['text']
+        text = answer['text'].lower()  # convert to lowercase
 
         # Only keep the first sentence
         if text.find('.') != -1:
             text = text.split('.')[0]
 
         text = text.replace(',', '')
         words = text.split(' ')
-        if 'No' in words or 'not' in words or 'no' in words:
+        if 'no' in words or 'not' in words:  # only the lowercase forms need checking now
             answer['text'] = 'no'
         else:
             answer['text'] = 'yes'
@@ -70,6 +70,8 @@ def eval_pope(answers, label_file):
 
     questions = [json.loads(line) for line in open(args.question_file)]
    questions = {question['question_id']: question for question in questions}
+
+
     answers = [json.loads(q) for q in open(args.result_file)]
     for file in os.listdir(args.annotation_dir):
         assert file.startswith('coco_pope_')
@@ -78,4 +80,4 @@ def eval_pope(answers, label_file):
         cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
         print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
         eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
-        print("====================================")
+        print("====================================")

Diff for: llava/eval/model_chair_loader.py (+112, new file)

import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)

    # Ensure the output file extension is .jsonl
    if not answers_file.endswith('.jsonl'):
        answers_file = answers_file.rsplit('.', 1)[0] + '.jsonl'

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["question_id"]
            image_file = line["image"]
            image_id = line["id"]
            qs = line["text"]
            cur_prompt = qs
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0]

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            result = {
                "question_id": idx,
                "image_id": image_id,
                "prompt": cur_prompt,
                "text": outputs,
                "answer_id": ans_id,
                "model_id": model_name,
                "metadata": {}
            }
            # Write one JSON object per line
            ans_file.write(json.dumps(result, ensure_ascii=False) + "\n")
            ans_file.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
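
The --num-chunks/--chunk-idx flags let several workers split the question file between them. A quick standalone illustration of the chunking helpers above:

import math

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]

# 10 questions across 3 workers yields chunks of 4, 4, and 2 items:
print(split_list(list(range(10)), 3))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

Note that when len(lst) is not a multiple of n this can produce fewer than n chunks (for example, 4 items over 3 workers gives two chunks of 2), so a chunk_idx near n-1 can fall out of range.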

Diff for: llava/eval/model_vqa_loader.py (+93 -1)

@@ -1,3 +1,95 @@
(The 92 added lines are a commented-out copy of the eval_single_image script shown above in dense_image_caption.py; the verbatim duplicate is omitted here.)
 import argparse
 import torch
 import os
@@ -141,4 +233,4 @@ def eval_model(args):
     parser.add_argument("--max_new_tokens", type=int, default=128)
     args = parser.parse_args()
 
-    eval_model(args)
+    eval_model(args)

Diff for: llava/eval/prepare_chair_questions.py (+36, new file)

import json
import os
import random

def create_question_file(coco_path, output_file, num_samples=200, random_seed=42):
    # Seed the RNG so the sample is reproducible
    random.seed(random_seed)

    # Load the COCO validation-set annotations
    val_data = json.load(open(os.path.join(coco_path, 'instances_val2014.json')))

    # Randomly sample from all images
    sampled_images = random.sample(val_data['images'], num_samples)

    questions = []
    for i, img in enumerate(sampled_images):
        questions.append({
            'image': f"COCO_val2014_{img['id']:012d}.jpg",
            'id': img['id'],
            'question_id': i,  # sequential question id
            'text': "Please describe this image in detail."  # fixed question text
        })

    # Save in JSONL format
    with open(output_file, 'w') as f:
        for q in questions:
            f.write(json.dumps(q) + '\n')

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--coco-path', type=str, required=True)
    parser.add_argument('--output-file', type=str, required=True)
    parser.add_argument('--num-samples', type=int, default=200)
    parser.add_argument('--random-seed', type=int, default=42)
    args = parser.parse_args()

    create_question_file(args.coco_path, args.output_file,
                         args.num_samples, args.random_seed)
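
Each line of the resulting question file is one standalone JSON object. A minimal sketch of reading it back; the file name and ids shown are illustrative, not from the commit:

# Hypothetical read-back; "chair_questions.jsonl" is a placeholder path.
import json

with open("chair_questions.jsonl") as f:
    first = json.loads(next(f))
print(first)
# e.g. {"image": "COCO_val2014_000000123456.jpg", "id": 123456,
#       "question_id": 0, "text": "Please describe this image in detail."}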
