
Commit 8a06e94

Initial commit
1 parent c121f04 commit 8a06e94

21 files changed: +2346 -62 lines changed

Diff for: chair.pkl (1.96 MB)

Binary file not shown.

Diff for: llava/eval/chair.py (+474)

Large diffs are not rendered by default.

Diff for: llava/eval/dense_image_caption.py (+92, new file)

import torch
import os
import json
from PIL import Image
from llava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images


def eval_single_image(model_path, image_path, question, conv_mode="llava_v1", temperature=0.2, top_p=None, num_beams=1, max_new_tokens=128):
    # Disable redundant initialization to speed up model loading
    disable_torch_init()

    # Load the model
    print("Loading model and tokenizer...")
    model_name = os.path.basename(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path, None, model_name
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Build the prompt from the question and the conversation template
    print("Preparing prompt...")
    if hasattr(model.config, "mm_use_im_start_end") and model.config.mm_use_im_start_end:
        prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question
    else:
        prompt = DEFAULT_IMAGE_TOKEN + '\n' + question

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], None)
    final_prompt = conv.get_prompt()

    # Preprocess the image
    print("Processing image...")
    image = Image.open(image_path).convert('RGB')
    image_tensor = process_images([image], image_processor, model.config)[0]

    # Tokenize the input; unsqueeze(0) adds the batch dimension generate() expects
    input_ids = tokenizer_image_token(
        final_prompt, tokenizer, return_tensors="pt"
    ).unsqueeze(0)
    input_ids = input_ids.to(device)
    image_tensor = image_tensor.to(dtype=torch.float16, device=device)

    # Run inference
    print("Running inference...")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            images=image_tensor.unsqueeze(0),  # add batch dimension
            do_sample=temperature > 0,  # without this, temperature/top_p are silently ignored
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        )

    # Decode the generated answer
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    return answer


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, required=True, help="Path to the pre-trained model.")
    parser.add_argument("--image-path", type=str, required=True, help="Path to the input image.")
    parser.add_argument("--question", type=str, required=True, help="Question to ask about the image.")
    parser.add_argument("--conv-mode", type=str, default="llava_v1", help="Conversation mode for the model.")
    parser.add_argument("--temperature", type=float, default=0.2, help="Sampling temperature.")
    parser.add_argument("--top_p", type=float, default=None, help="Nucleus sampling probability.")
    parser.add_argument("--num_beams", type=int, default=1, help="Number of beams for beam search.")
    parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of tokens to generate.")
    args = parser.parse_args()

    # Run inference
    result = eval_single_image(
        model_path=args.model_path,
        image_path=args.image_path,
        question=args.question,
        conv_mode=args.conv_mode,
        temperature=args.temperature,
        top_p=args.top_p,
        num_beams=args.num_beams,
        max_new_tokens=args.max_new_tokens,
    )
    print("\nGenerated Answer:", result)

Diff for: llava/eval/eval_pope.py (+5 -3)

@@ -6,15 +6,15 @@ def eval_pope(answers, label_file):
     label_list = [json.loads(q)['label'] for q in open(label_file, 'r')]
 
     for answer in answers:
-        text = answer['text']
+        text = answer['text'].lower()  # convert to lowercase
 
         # Only keep the first sentence
         if text.find('.') != -1:
             text = text.split('.')[0]
 
         text = text.replace(',', '')
         words = text.split(' ')
-        if 'No' in words or 'not' in words or 'no' in words:
+        if 'no' in words or 'not' in words:  # only the lowercase forms need checking now
             answer['text'] = 'no'
         else:
             answer['text'] = 'yes'
@@ -70,6 +70,8 @@ def eval_pope(answers, label_file):
 
     questions = [json.loads(line) for line in open(args.question_file)]
    questions = {question['question_id']: question for question in questions}
+
+
     answers = [json.loads(q) for q in open(args.result_file)]
     for file in os.listdir(args.annotation_dir):
         assert file.startswith('coco_pope_')
@@ -78,4 +80,4 @@ def eval_pope(answers, label_file):
         cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
         print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
         eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
-        print("====================================")
+        print("====================================")

Diff for: llava/eval/model_chair_loader.py (+112, new file)

import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path

from PIL import Image
import math


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)

    # Ensure the output file extension is .jsonl
    if not answers_file.endswith('.jsonl'):
        answers_file = answers_file.rsplit('.', 1)[0] + '.jsonl'

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["question_id"]
            image_file = line["image"]
            image_id = line["id"]
            qs = line["text"]
            cur_prompt = qs
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0]

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            result = {
                "question_id": idx,
                "image_id": image_id,
                "prompt": cur_prompt,
                "text": outputs,
                "answer_id": ans_id,
                "model_id": model_name,
                "metadata": {}
            }
            # Write one JSON object per line
            ans_file.write(json.dumps(result, ensure_ascii=False) + "\n")
            ans_file.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    args = parser.parse_args()

    eval_model(args)
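
The --num-chunks/--chunk-idx flags let several workers split the question file between them. A quick standalone illustration of the chunking helpers above:

import math

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]

# 10 questions across 3 workers yields chunks of 4, 4, and 2 items:
print(split_list(list(range(10)), 3))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

Note that when len(lst) is not a multiple of n this can produce fewer than n chunks (for example, 4 items over 3 workers gives two chunks of 2), so a chunk_idx near n-1 can fall out of range.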

Diff for: llava/eval/model_vqa_loader.py (+93 -1)

@@ -1,3 +1,95 @@
(The 92 added lines are a commented-out copy of the eval_single_image script shown above in dense_image_caption.py; the verbatim duplicate is omitted here.)
 import argparse
 import torch
 import os
@@ -141,4 +233,4 @@ def eval_model(args):
     parser.add_argument("--max_new_tokens", type=int, default=128)
     args = parser.parse_args()
 
-    eval_model(args)
+    eval_model(args)

Diff for: llava/eval/prepare_chair_questions.py (+36, new file)

import json
import os
import random

def create_question_file(coco_path, output_file, num_samples=200, random_seed=42):
    # Seed the RNG so the sample is reproducible
    random.seed(random_seed)

    # Load the COCO validation-set annotations
    val_data = json.load(open(os.path.join(coco_path, 'instances_val2014.json')))

    # Randomly sample from all images
    sampled_images = random.sample(val_data['images'], num_samples)

    questions = []
    for i, img in enumerate(sampled_images):
        questions.append({
            'image': f"COCO_val2014_{img['id']:012d}.jpg",
            'id': img['id'],
            'question_id': i,  # sequential question id
            'text': "Please describe this image in detail."  # fixed question text
        })

    # Save in JSONL format
    with open(output_file, 'w') as f:
        for q in questions:
            f.write(json.dumps(q) + '\n')

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--coco-path', type=str, required=True)
    parser.add_argument('--output-file', type=str, required=True)
    parser.add_argument('--num-samples', type=int, default=200)
    parser.add_argument('--random-seed', type=int, default=42)
    args = parser.parse_args()

    create_question_file(args.coco_path, args.output_file,
                         args.num_samples, args.random_seed)
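
Each line of the resulting question file is one standalone JSON object. A minimal sketch of reading it back; the file name and ids shown are illustrative, not from the commit:

# Hypothetical read-back; "chair_questions.jsonl" is a placeholder path.
import json

with open("chair_questions.jsonl") as f:
    first = json.loads(next(f))
print(first)
# e.g. {"image": "COCO_val2014_000000123456.jpg", "id": 123456,
#       "question_id": 0, "text": "Please describe this image in detail."}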
