diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/README.md @@ -0,0 +1 @@ + diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_generate_configs.py b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_generate_configs.py new file mode 100644 index 00000000..0f189b82 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/_generate_configs.py @@ -0,0 +1,54 @@ +from datasets import load_dataset + +# dataset = load_dataset("gagan3012/multilingual-llava-bench") + +configs = ['arabic', 'bengali', 'chinese', 'french', 'hindi', 'japanese', 'russian', 'spanish', 'urdu'] + +for config in configs: + yaml_output = f""" + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: {config} + token: True + task: "llava_in_the_wild_{config}" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + """ + + with open(f"{config}_llava_in_the_wild.yaml", "w") as f: + f.write(yaml_output) + +# Path: _generate_configs.py \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml new file mode 100644 index 00000000..83447d0d --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/arabic_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: arabic + token: True + task: "llava_in_the_wild_arabic" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + 
gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml new file mode 100644 index 00000000..71ec6f86 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/bengali_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: bengali + token: True + task: "llava_in_the_wild_bengali" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml new file mode 100644 index 00000000..5f8f487d --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/chinese_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: chinese + token: True + task: "llava_in_the_wild_chinese" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml new file mode 100644 index 00000000..b1c004e3 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/french_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + 
dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: french + token: True + task: "llava_in_the_wild_french" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml new file mode 100644 index 00000000..507afce7 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/hindi_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: hindi + token: True + task: "llava_in_the_wild_hindi" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml new file mode 100644 index 00000000..a7fb6e96 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/japanese_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: japanese + token: True + task: "llava_in_the_wild_japanese" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + 
process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/rule.json b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/rule.json new file mode 100644 index 00000000..26c7f4e0 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/rule.json @@ -0,0 +1,11 @@ +{ + "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."}, + "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."}, + "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. 
Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}, + "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. 
The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."} +} \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml new file mode 100644 index 00000000..ecb9c146 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/russian_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: russian + token: True + task: "llava_in_the_wild_russian" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml new file mode 100644 index 00000000..02641ca2 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/spanish_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: spanish + token: True + task: "llava_in_the_wild_spanish" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml new file mode 100644 index 
00000000..6bf28c0b --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/urdu_llava_in_the_wild.yaml @@ -0,0 +1,42 @@ + + dataset_path: "gagan3012/multilingual-llava-bench" + dataset_kwargs: + config: urdu + token: True + task: "llava_in_the_wild_urdu" + test_split: train + output_type: generate_until + doc_to_visual: !function utils.llava_doc_to_visual + doc_to_text: !function utils.llava_doc_to_text + doc_to_target: "gpt_answer" + generation_kwargs: + until: + - "ASSISTANT:" + image_aspect_ratio: original + max_new_tokens: 1024 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false + process_results: !function utils.llava_process_results + metric_list: + - metric: gpt_eval_llava_all + aggregation: !function utils.llava_all_aggregation + higher_is_better: true + - metric: gpt_eval_llava_conv + aggregation: !function utils.llava_conv_aggregation + higher_is_better: true + - metric: gpt_eval_llava_detail + aggregation: !function utils.llava_detail_aggregation + higher_is_better: true + - metric: gpt_eval_llava_complex + aggregation: !function utils.llava_complex_aggregation + higher_is_better: true + metadata: + version: 0.0 + gpt_eval_model_name: "gpt-4-0613" + model_specific_prompt_kwargs: + default: + pre_prompt: "" + post_prompt: "" + \ No newline at end of file diff --git a/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/utils.py b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/utils.py new file mode 100644 index 00000000..ac86ee99 --- /dev/null +++ b/lmms_eval/tasks/multilingual-llava-bench-in-the-wild/utils.py @@ -0,0 +1,197 @@ +import json +import logging +import os +import requests +import numpy as np +import openai +from openai import OpenAI +import time +import yaml +from pathlib import Path +from copy import deepcopy + +eval_logger = logging.getLogger("lmms-eval") +NUM_SECONDS_TO_SLEEP = 5 + +LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"] + +rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r")) + +with open(Path(__file__).parent / "arabic_llava_in_the_wild.yaml", "r") as f:  # any of the generated configs works here; they all share the same metadata block + raw_data = f.readlines() + safe_data = [] + for i, line in enumerate(raw_data): + # remove function definition since yaml load cannot handle it + if "!function" not in line: + safe_data.append(line) + + config = yaml.safe_load("".join(safe_data)) + +GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] + +API_TYPE = os.getenv("API_TYPE", "openai") + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +def get_eval(content: str, max_tokens: int, retries: int = 5): + global headers + + messages = [ + { + "role": "system", + "content": "You are a helpful and precise assistant for checking the quality of the answer.", + }, + {"role": "user", "content": content}, + ] + + payload = { + "model": GPT_EVAL_MODEL_NAME, + "messages": messages, + "temperature": 0.2, + "max_tokens": max_tokens, + } + + for attempt in range(retries): + try: + response = requests.post(API_URL, headers=headers, json=payload, timeout=60) +
response.raise_for_status() + response_data = response.json() + + content = response_data["choices"][0]["message"]["content"].strip() + if content != "": + return content, response_data["model"] + # an empty reply falls through and is retried on the next attempt + + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}") + if attempt < retries - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All {retries} attempts failed. Last error message: {e}") + return "", "" + return "", "" + + +def parse_score(review): + try: + score_pair = review.split("\n")[0] + score_pair = score_pair.replace(",", " ") + sp = score_pair.split(" ") + if len(sp) == 2: + return [float(sp[0]), float(sp[1])] + else: + eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]") + return [-1, -1] + except Exception as e: + eval_logger.debug(f"Error: {e}. Returning [-1, -1]") + return [-1, -1] + + +def llava_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +def llava_doc_to_text(doc, model_specific_prompt_kwargs=None): + if model_specific_prompt_kwargs is None: + model_specific_prompt_kwargs = {} + pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "") + post_prompt = model_specific_prompt_kwargs.get("post_prompt", "") + return f"{pre_prompt}{doc['question']}{post_prompt}" + + +def llava_process_results(doc, result): + """ + Args: + doc: an instance of the eval dataset + result: [pred] + Returns: + a dictionary mapping each metric name (e.g. gpt_eval_llava_all) to the review dict consumed by the aggregation functions + """ + try: + question = doc.get("question", "") + ans1 = doc.get("gpt_answer", "") + ans2 = result[0] if result else "" + captions = doc.get("caption", []) + context = "\n".join(captions) if isinstance(captions, list) else captions + category = "llava_bench_" + doc.get("category", "") + rule = rule_dict.get(category, {}) + prompt = rule.get("prompt", "") + role = rule.get("role", "user") + content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n" + + review, model_name = get_eval(content, 1024) + scores = parse_score(review) + except Exception as e: + eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}") + review = "Failed to Get a Proper Review."
+ model_name = "Failed Request" + scores = [-1, -1] + + metric = f"gpt_eval_llava_{doc.get('category', 'all')}" + category_review_dict = {"question": question, "ans1": ans1, "ans2": ans2, "context": context, "category": category, "review": review, "scores": scores, "eval_model": model_name, "content": content} + + non_category_review_dict = deepcopy(category_review_dict) + non_category_review_dict["scores"] = [-999, -999] + + data_dict = {} + for m in LLAVA_W_METRICS: + if m == metric: + data_dict[m] = category_review_dict + else: + data_dict[m] = non_category_review_dict + data_dict["gpt_eval_llava_all"] = category_review_dict + + # return {"gpt_eval_llava_all": review_dict} + return data_dict + + +def llava_conv_aggregation(results): + return llava_aggregation(results, "conv") + + +def llava_complex_aggregation(results): + return llava_aggregation(results, "complex") + + +def llava_detail_aggregation(results): + return llava_aggregation(results, "detail") + + +def llava_all_aggregation(results): + return llava_aggregation(results, "all") + + +def llava_aggregation(results, category): + try: + scores = [] + for result in results: + if -999 in result["scores"]: + continue + scores.append(result["scores"]) + + stats = np.asarray(scores).mean(0).tolist() + stats = [round(x, 3) for x in stats] + # gpt4_score_percentage = stats[0] * 10 + # model_score_percentage = stats[1] * 10 + # eval_logger.info(f"Category: {category}") + # eval_logger.info(f"GPT4 Score: {gpt4_score_percentage:.1f}%") + # eval_logger.info(f"Model Score: {model_score_percentage:.1f}%") + # eval_logger.info("=========================") + return round(stats[1] / stats[0] * 100, 1) + except Exception as e: + eval_logger.info(f"Error in llava_aggregation: {e}, and in category: {category}") + return None
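
Each generated config carries !function resolver tags that plain yaml.safe_load cannot handle, which is why utils.py filters those lines out before reading the shared metadata block. Below is a minimal standalone sketch of the same check, assuming it is run from lmms_eval/tasks/multilingual-llava-bench-in-the-wild/; the choice of the Arabic config is illustrative only, since all nine files share the same structure.

import yaml
from pathlib import Path

# Mirror the loading logic at the top of utils.py: drop the !function lines,
# then parse the remainder to reach the task name and metadata block.
cfg_path = Path("arabic_llava_in_the_wild.yaml")  # any of the nine generated configs
safe_lines = [line for line in cfg_path.read_text().splitlines(keepends=True) if "!function" not in line]
cfg = yaml.safe_load("".join(safe_lines))
print(cfg["task"])                             # llava_in_the_wild_arabic
print(cfg["metadata"]["gpt_eval_model_name"])  # gpt-4-0613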
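The judge-output handling can also be exercised without calling the API. The sketch below is illustrative only: it assumes it is run from the same task directory so that utils resolves to the module added above, and the review strings and score pairs are made-up examples.

from utils import parse_score, llava_aggregation

# A well-formed judge reply: the first line carries the two scores, the explanation follows.
print(parse_score("8 6\nAssistant 1 described the image more completely."))  # [8.0, 6.0]
# Malformed replies fall back to [-1, -1] instead of raising.
print(parse_score("no scores here"))  # [-1, -1]

# Aggregation averages the per-example score pairs ([reference score, model score]),
# skips the -999 placeholders that llava_process_results writes for off-category metrics,
# and reports the model-to-reference ratio as a percentage.
fake_results = [
    {"scores": [8.0, 6.0]},
    {"scores": [9.0, 9.0]},
    {"scores": [-999, -999]},  # ignored
]
print(llava_aggregation(fake_results, "conv"))  # 88.2

Each config registers a task named llava_in_the_wild_<language> (for example, llava_in_the_wild_arabic), and the GPT judge needs OPENAI_API_KEY (or AZURE_API_KEY together with API_TYPE=azure) exported when the tasks are actually run.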