# Copyright (c) 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, json, argparse, re
import numpy as np
good_threh, bad_threh = 0.8, 0.4
aspects = ["dynamic_degree", "subject_motion_degree", "camera_motion_degree", "light_change",
"static_visual_quality", "aesthetic_quality", "technical_quality", "structural_correctness",
"temporal_visual_quality", "appearance_consistency", "flickering", "motion_naturalness",
"tv_alignment", "tv_alignment_appearance", "tv_alignment_motion"]
def load_json(json_file):
    with open(json_file, 'r') as f:
        datas = json.load(f)
    return datas
def save_json(datas, json_file):
    with open(json_file, 'w') as f:
        json.dump(datas, f, indent=4)
def exp_decrease(x, left, right, speed=10, init_value=1, final_value=0, reversed=False):
    """
    exponential decay from left to right
    reversed: if True, decay from right to left
    """
    if reversed:
        return np.where(x > right,
                        init_value,  # if x > right, return init_value
                        np.where(x >= left,
                                 np.exp(-speed * (right - x)) * init_value,  # if left <= x <= right, use exponential function
                                 final_value))  # if x < left, return final_value
    else:
        return np.where(x < left,
                        init_value,  # if x < left, return init_value
                        np.where(x <= right,
                                 np.exp(-speed * (x - left)) * init_value,  # if left <= x <= right, use exponential function
                                 final_value))  # if x > right, return final_value
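# Illustrative values for exp_decrease (computed from the formula above with the
# default speed=10, init_value=1, final_value=0; shown only as a sanity check):
#   exp_decrease(0.3, left=bad_threh, right=1)                  -> 1.0             (below the decay window)
#   exp_decrease(0.5, left=bad_threh, right=1)                  -> exp(-1) ~ 0.37  (inside the window, decaying)
#   exp_decrease(0.9, left=0, right=good_threh, reversed=True)  -> 1.0             (above the window)
#   exp_decrease(0.7, left=0, right=good_threh, reversed=True)  -> exp(-1) ~ 0.37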
def extract_hard_rating(vlm_rating):
    for key in ["video_a", "video_b"]:
        rating = re.findall(r'\d+', vlm_rating[key])
        vlm_rating[key] = int(rating[0]) if rating else 0
    return vlm_rating
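# Illustrative example (hypothetical input): extract_hard_rating keeps the first integer
# found in each free-form rating string and falls back to 0 when no digit is present, e.g.
#   extract_hard_rating({"video_a": "Rating: 3", "video_b": "poor"}) -> {"video_a": 3, "video_b": 0}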
def match_pairwise(human_rating, vlm_rating):
    """
    human_rating: A is better, B is better, same good, same bad
    vlm_rating: the first video, the second video, same good, same bad
    """
    if "first video" in vlm_rating.lower():
        vlm_rating = "A is better"
    elif "second video" in vlm_rating.lower():
        vlm_rating = "B is better"
    elif "same good" in vlm_rating.lower() or "same high" in vlm_rating.lower() or "both good" in vlm_rating.lower() or "both high" in vlm_rating.lower():
        vlm_rating = "same good"
    elif "same bad" in vlm_rating.lower() or "same low" in vlm_rating.lower() or "both bad" in vlm_rating.lower() or "both low" in vlm_rating.lower():
        vlm_rating = "same bad"
    else:
        return False, ""
    return human_rating == vlm_rating, vlm_rating
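# Illustrative examples (hypothetical VLM answers): match_pairwise canonicalises the
# free-form answer before comparing it against the human label, e.g.
#   match_pairwise("A is better", "The first video is clearly better.") -> (True, "A is better")
#   match_pairwise("same good", "Both good, hard to tell apart.")       -> (True, "same good")
#   match_pairwise("B is better", "no preference")                      -> (False, "")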
def eval_model_rating(human_rating, vlm_rating, video_eval_mode, mllm_eval_mode='score'):
    """
    video_eval_mode: how videos are evaluated
    mllm_eval_mode: how mllms are evaluated
    """
    assert video_eval_mode in ['pairwise', 'pairwise_no_vid_index', 'single_soft_yn', 'single_soft_good_bad', 'single_soft_adaptive',
                               'single_hard', 'single_soft_reg-dim', 'single_soft_reg-avg']
    assert mllm_eval_mode in ['acc', 'score']
    if video_eval_mode == "single_hard":
        vlm_rating = extract_hard_rating(vlm_rating)
    if isinstance(vlm_rating, dict) and isinstance(vlm_rating['video_a'], list):
        vlm_rating['video_a'] = vlm_rating['video_a'][0]
        vlm_rating['video_b'] = vlm_rating['video_b'][0]
    if mllm_eval_mode == 'acc' and video_eval_mode.startswith('pairwise'):  # video pair preference
        return match_pairwise(human_rating, vlm_rating)
    elif mllm_eval_mode == 'acc' and video_eval_mode.startswith('single'):  # adapt single video rating to pairwise preference
        if abs(vlm_rating['video_a'] - vlm_rating['video_b']) >= 0.05 \
                or (bad_threh < vlm_rating['video_a'] < good_threh) \
                or (bad_threh < vlm_rating['video_b'] < good_threh):
            vlm_rating = "A is better" if vlm_rating['video_a'] > vlm_rating['video_b'] else "B is better"
        elif vlm_rating['video_a'] >= good_threh and vlm_rating['video_b'] >= good_threh:
            vlm_rating = "same good"
        elif vlm_rating['video_a'] <= bad_threh and vlm_rating['video_b'] <= bad_threh:
            vlm_rating = "same bad"
        else:
            print(vlm_rating)
        return human_rating == vlm_rating, vlm_rating
    elif mllm_eval_mode == 'score':  # single video rating
        if video_eval_mode.startswith('single_soft_reg'):  # normalize videoscore rating to [0,1]
            for key in ["video_a", "video_b"]:
                vlm_rating[key] = (vlm_rating[key] - 1) / (4 - 1)
        if human_rating == "A is better":
            return vlm_rating["video_a"] > vlm_rating["video_b"], None
        elif human_rating == "B is better":
            return vlm_rating["video_a"] < vlm_rating["video_b"], None
        elif human_rating == "same good":
            return exp_decrease(vlm_rating["video_a"], 0, good_threh, reversed=True) * exp_decrease(vlm_rating["video_b"], 0, good_threh, reversed=True), None
        elif human_rating == "same bad":
            return exp_decrease(vlm_rating["video_a"], bad_threh, 1) * exp_decrease(vlm_rating["video_b"], bad_threh, 1), None
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path', default=None)
    parser.add_argument('--result_prefix', default=None)
    parser.add_argument('--eval_mode', default=None)
    parser.add_argument('--mllm_eval_mode', default=None)
    parser.add_argument('--overwrite_merge_result', action='store_true')
    args = parser.parse_args()
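    # Example invocation (illustrative; the script name, paths, and prefix are placeholders):
    #   python eval.py --result_path results/ --result_prefix model_pairwise_part \
    #       --eval_mode pairwise --mllm_eval_mode acc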
    merged_file = os.path.join(args.result_path, '_'.join(args.result_prefix.split('_')[:-1])+'.json')
    if os.path.exists(merged_file) and not args.overwrite_merge_result:
        merged_results = load_json(merged_file)
    else:
        merged_results = {}
        result_files = [f"{args.result_path}/{rf}" for rf in os.listdir(args.result_path) if rf.startswith(args.result_prefix)]
        for rf in result_files:
            results = load_json(rf)
            merged_results.update(results)
            # os.remove(rf)
        save_json(merged_results, merged_file)
    # Load Human Annotations
    annos = load_json("annotations/annotations.json")
    annos_balanced = load_json("annotations/annotations_balanced.json")
    # Calculate the results
    num_correct, num_total = 0, 0
    results = {}
    for aspect in aspects:
        results[aspect] = {"correct": 0, "total": 0}
    for r in merged_results.values():
        if args.eval_mode.startswith('pairwise') and (r['idx'] not in annos_balanced or r['subaspect'] not in annos_balanced[r['idx']]['subaspects']):
            continue
        if (r['idx'] not in annos) or (r['subaspect'] not in annos[r['idx']]['subaspects']):
            continue
        subaspect = r['subaspect']
        if subaspect not in aspects:
            continue
        if args.mllm_eval_mode is None:
            mllm_eval_mode = 'score' if args.eval_mode.startswith('single') else 'acc'
        else:
            mllm_eval_mode = args.mllm_eval_mode
        rating, choice = eval_model_rating(r['human_rating'], r['vlm_rating'], video_eval_mode=args.eval_mode, mllm_eval_mode=mllm_eval_mode)  # return rating and the choice of vlm
        results[subaspect]['correct'] += rating
        results[subaspect]['total'] += 1
        num_correct += rating
        num_total += 1
    for aspect, result in results.items():
        if result['total'] > 0:
            score = 100 * result['correct'] / result['total']
            print(f"{aspect}: {score:.2f}% ({result['correct']}/{result['total']})")
    print(f"Average Score: {100*num_correct/num_total:.2f}% ({num_correct:.2f}/{num_total})")
    results['avg'] = {"correct": num_correct, "total": num_total}