
Commit

Luodian committed Jun 18, 2024
2 parents 22a4958 + 050b2c3 commit 511b625
Showing 449 changed files with 22,285 additions and 582 deletions.
Empty file modified .github/issue_template.md (100644 → 100755)
Empty file modified .github/pull_request_template.md (100644 → 100755)
Empty file modified .github/workflows/black.yml (100644 → 100755)
13 changes: 13 additions & 0 deletions .gitignore (100644 → 100755)
@@ -24,3 +24,16 @@ submissions/
 lmms_eval/tasks/hallusion_bench/hallusion_output_vs_model.json
 lmms_eval/tasks/hallusion_bench/hallusion_output_vd_model.json
 zk.log
+cache_dir
+ckpt
+pretrained/
+LLaVA/
+*logs
+temp/
+InternVL/
+logs/
+data/
+llava-video/
+Video-MME/
+VATEX/
+lmms_eval/tasks/vatex/__pycache__/utils.cpython-310.pyc
Empty file modified .pre-commit-config.yaml (100644 → 100755)
56 changes: 56 additions & 0 deletions LICENSE
@@ -0,0 +1,56 @@
# For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License.

MIT License

Copyright (c) 2024 LMMs-Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

# For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License.

Apache 2.0 License

Copyright (c) 2024 LMMs-Lab

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

When modifying the code, please include the following information about the original lmms-eval source:
# Adopted from lmms-eval from /~https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright:
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
368 changes: 172 additions & 196 deletions README.md (100644 → 100755)

Large diffs are not rendered by default.

Empty file modified docs/README.md (100644 → 100755)
Empty file modified docs/commands.md (100644 → 100755)
122 changes: 122 additions & 0 deletions docs/current_tasks.md
@@ -0,0 +1,122 @@
# Current Tasks

> () indicates the task name in lmms_eval; the task name is also what you pass to specify the dataset in a configuration file.
> The list below is updated manually. You can use `lmms_eval task --list` to list all supported tasks and their task names. An example invocation is shown after the list.
- AI2D (ai2d)
- ChartQA (chartqa)
- CMMMU (cmmmu)
  - CMMMU Validation (cmmmu_val)
  - CMMMU Test (cmmmu_test)
- COCO Caption (coco_cap)
  - COCO 2014 Caption (coco2014_cap)
    - COCO 2014 Caption Validation (coco2014_cap_val)
    - COCO 2014 Caption Test (coco2014_cap_test)
  - COCO 2017 Caption (coco2017_cap)
    - COCO 2017 Caption MiniVal (coco2017_cap_val)
    - COCO 2017 Caption MiniTest (coco2017_cap_test)
- [ConBench](/~https://github.com/foundation-multimodal-models/ConBench) (conbench)
- DOCVQA (docvqa)
  - DOCVQA Validation (docvqa_val)
  - DOCVQA Test (docvqa_test)
- Ferret (ferret)
  - Ferret Test (ferret_test)
- Flickr30K (flickr30k)
- GQA (gqa)
- HallusionBenchmark (hallusion_bench_image)
- Infographic VQA (info_vqa)
  - Infographic VQA Validation (info_vqa_val)
  - Infographic VQA Test (info_vqa_test)
- LLaVA-Bench (llava_in_the_wild)
- LLaVA-Bench-COCO (llava_bench_coco)
- MathVerse (mathverse)
  - MathVerse Text Dominant (mathverse_testmini_text_dominant)
  - MathVerse Text Only (mathverse_testmini_text_only)
  - MathVerse Text Lite (mathverse_testmini_text_lite)
  - MathVerse Vision Dominant (mathverse_testmini_vision_dominant)
  - MathVerse Vision Intensive (mathverse_testmini_vision_intensive)
  - MathVerse Vision Only (mathverse_testmini_vision_only)
- MathVista (mathvista)
  - MathVista Validation (mathvista_testmini)
  - MathVista Test (mathvista_test)
- MMBench (mmbench)
  - MMBench English (mmbench_en)
    - MMBench English Dev (mmbench_en_dev)
    - MMBench English Test (mmbench_en_test)
  - MMBench Chinese (mmbench_cn)
    - MMBench Chinese Dev (mmbench_cn_dev)
    - MMBench Chinese Test (mmbench_cn_test)
- MME (mme)
- MMMU (mmmu)
  - MMMU Validation (mmmu_val)
  - MMMU Test (mmmu_test)
- MMUPD (mmupd)
  - MMUPD Base (mmupd_base)
    - MMAAD Base (mmaad_base)
    - MMIASD Base (mmiasd_base)
    - MMIVQD Base (mmivqd_base)
  - MMUPD Option (mmupd_option)
    - MMAAD Option (mmaad_option)
    - MMIASD Option (mmiasd_option)
    - MMIVQD Option (mmivqd_option)
  - MMUPD Instruction (mmupd_instruction)
    - MMAAD Instruction (mmaad_instruction)
    - MMIASD Instruction (mmiasd_instruction)
    - MMIVQD Instruction (mmivqd_instruction)
- MMVet (mmvet)
- Multi-DocVQA (multidocvqa)
  - Multi-DocVQA Validation (multidocvqa_val)
  - Multi-DocVQA Test (multidocvqa_test)
- NoCaps (nocaps)
  - NoCaps Validation (nocaps_val)
  - NoCaps Test (nocaps_test)
- OKVQA (ok_vqa)
  - OKVQA Validation 2014 (ok_vqa_val2014)
- POPE (pope)
- RefCOCO (refcoco)
  - refcoco_seg_test
  - refcoco_seg_val
  - refcoco_seg_testA
  - refcoco_seg_testB
  - refcoco_bbox_test
  - refcoco_bbox_val
  - refcoco_bbox_testA
  - refcoco_bbox_testB
- RefCOCO+ (refcoco+)
  - refcoco+_seg
    - refcoco+_seg_val
    - refcoco+_seg_testA
    - refcoco+_seg_testB
  - refcoco+_bbox
    - refcoco+_bbox_val
    - refcoco+_bbox_testA
    - refcoco+_bbox_testB
- RefCOCOg (refcocog)
  - refcocog_seg_test
  - refcocog_seg_val
  - refcocog_bbox_test
  - refcocog_bbox_val
- ScienceQA (scienceqa_full)
  - ScienceQA Full (scienceqa)
  - ScienceQA IMG (scienceqa_img)
- ScreenSpot (screenspot)
  - ScreenSpot REC / Grounding (screenspot_rec)
  - ScreenSpot REG / Instruction Generation (screenspot_reg)
- SeedBench (seedbench)
- SeedBench 2 (seedbench_2)
- ST-VQA (stvqa)
- TextCaps (textcaps)
  - TextCaps Validation (textcaps_val)
  - TextCaps Test (textcaps_test)
- TextVQA (textvqa)
  - TextVQA Validation (textvqa_val)
  - TextVQA Test (textvqa_test)
- VizWizVQA (vizwiz_vqa)
  - VizWizVQA Validation (vizwiz_vqa_val)
  - VizWizVQA Test (vizwiz_vqa_test)
- VQAv2 (vqav2)
  - VQAv2 Validation (vqav2_val)
  - VQAv2 Test (vqav2_test)
- WebSRC (websrc)
  - WebSRC Validation (websrc_val)
  - WebSRC Test (websrc_test)
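A task name from the parentheses above is what gets passed to the runner's `--tasks` flag (comma-separated for multiple tasks). Below is a minimal sketch of a programmatic invocation; the checkpoint is a placeholder, and the flags are taken from `lmms_eval/__main__.py` as modified in this commit:

```python
import subprocess

# Hypothetical run: evaluate a LLaVA checkpoint on MME and MMBench English Dev.
# "llava" matches the model_name used elsewhere in this commit; the pretrained
# path is a placeholder, not a recommendation.
subprocess.run(
    [
        "python", "-m", "lmms_eval",
        "--model", "llava",
        "--model_args", "pretrained=liuhaotian/llava-v1.5-7b",
        "--tasks", "mme,mmbench_en_dev",
        "--output_path", "./logs/",
        "--log_samples",
        "--log_samples_suffix", "demo_run",
    ],
    check=True,
)
```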
Empty file modified docs/model_guide.md (100644 → 100755)
2 changes: 1 addition & 1 deletion docs/task_guide.md (100644 → 100755)
@@ -27,7 +27,7 @@ doc_to_target: "answer"
 generation_kwargs:
   max_new_tokens: 16
   temperature: 0
-  top_p: 0
+  top_p: 1.0
   num_beams: 1
   do_sample: false
 # The return value of process_results will be used by metrics
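The `top_p` correction is worth a note: with `do_sample: false` the sampling knobs are inert, but `top_p: 0` is a degenerate nucleus size — at best it collapses the nucleus to a single token, and some backends reject the value outright — whereas `1.0` means "no truncation". A small, self-contained sketch of top-p (nucleus) filtering, not lmms-eval code, shows the difference:

```python
import numpy as np

def top_p_filter(probs: np.ndarray, top_p: float) -> np.ndarray:
    """Keep the smallest set of tokens whose cumulative probability reaches top_p."""
    order = np.argsort(probs)[::-1]           # tokens sorted by probability, descending
    cum = np.cumsum(probs[order])
    cutoff = np.searchsorted(cum, top_p) + 1  # number of tokens kept in the nucleus
    kept = order[:cutoff]
    filtered = np.zeros_like(probs)
    filtered[kept] = probs[kept]
    return filtered / filtered.sum()          # renormalize over the nucleus

probs = np.array([0.5, 0.3, 0.15, 0.05])
print(top_p_filter(probs, 1.0))  # full distribution survives: [0.5 0.3 0.15 0.05]
print(top_p_filter(probs, 0.6))  # only the 0.5 and 0.3 tokens survive
```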
15 changes: 0 additions & 15 deletions example_eval.yaml

This file was deleted.

Empty file modified lmms_eval/__init__.py (100644 → 100755)
43 changes: 31 additions & 12 deletions lmms_eval/__main__.py (100644 → 100755)
@@ -106,9 +106,16 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--log_samples_suffix",
         type=str,
-        default="",
+        default="model_outputs",
         help="Specify a suffix for the log_samples file name.",
     )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
     parser.add_argument(
         "--show_config",
         action="store_true",
@@ -228,6 +235,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
 
     initialize_tasks(args.verbosity)
 
+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")
     if args.limit:
         eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     if args.include_path is not None:
@@ -244,14 +255,17 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
         eval_logger.info(log_message)
-        task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
-        for task_name in task_dict.keys():
-            task_obj = task_dict[task_name]
-            if type(task_obj) == tuple:
-                group, task_obj = task_obj
-                if task_obj is None:
-                    continue
-            eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+        for task_name in sorted(ALL_TASKS):
+            try:
+                task_dict = get_task_dict([task_name], model_name="llava")
+                task_obj = task_dict[task_name]
+                if type(task_obj) == tuple:
+                    group, task_obj = task_obj
+                    if task_obj is None:
+                        continue
+                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+            except Exception as e:
+                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
         sys.exit()
     else:
         tasks_list = args.tasks.split(",")
@@ -271,6 +285,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
     if args.output_path:
+        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
+            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
+            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
+
         hash_input = f"{args.model_args}".encode("utf-8")
         hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
         path = Path(args.output_path)
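Concretely, under this truncation rule a 20-character suffix such as `all_model_outputs_v2` triggers the warning and is stored as `all_m...ts_v2` (first five characters, an ellipsis, last five).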
@@ -293,6 +311,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         log_samples=args.log_samples,
         gen_kwargs=args.gen_kwargs,
         cli_args=args,
+        predict_only=args.predict_only,
     )
 
     if results is not None:
@@ -315,9 +334,9 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         for task_name, config in results["configs"].items():
             filename = args.output_path.joinpath(f"{task_name}.json")
             # Structure the data with 'args' and 'logs' keys
-            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
-            samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable)
-            filename.open("w").write(samples_dumped)
+            data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
+            samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
+            filename.open("w", encoding="utf-8").write(samples_dumped)
             eval_logger.info(f"Saved samples to {filename}")
 
         return results, samples
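The two serialization tweaks matter for multilingual tasks (e.g., the MMBench Chinese splits): `ensure_ascii=False` plus an explicit UTF-8 file encoding keeps logged samples human-readable instead of `\uXXXX`-escaped. A quick illustration:

```python
import json

sample = {"answer": "你好"}
print(json.dumps(sample))                      # {"answer": "\u4f60\u597d"} -- escaped
print(json.dumps(sample, ensure_ascii=False))  # {"answer": "你好"} -- readable UTF-8
```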
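Putting the new flag together: `--predict_only` (or `-x`) implies `--log_samples`, and both now require `--output_path`. A hedged sketch of a generation-only run (model arguments are placeholders):

```python
import subprocess

# Generate and log raw model outputs without computing metrics.
# Omitting --output_path here would raise the new ValueError above.
subprocess.run(
    [
        "python", "-m", "lmms_eval",
        "--model", "llava",
        "--model_args", "pretrained=liuhaotian/llava-v1.5-7b",
        "--tasks", "textvqa_val",
        "--predict_only",
        "--output_path", "./logs/",
    ],
    check=True,
)
```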
Empty file modified lmms_eval/api/__init__.py (100644 → 100755)
Empty file modified lmms_eval/api/filter.py (100644 → 100755)
Empty file modified lmms_eval/api/instance.py (100644 → 100755)
15 changes: 15 additions & 0 deletions lmms_eval/api/metrics.py (100644 → 100755)
@@ -16,6 +16,11 @@
 
 
 # Register Aggregations First
+@register_aggregation("bypass")
+def bypass_agg(arr):
+    return 999
+
+
 @register_aggregation("mean")
 def mean(arr):
     return sum(arr) / len(arr)
@@ -226,6 +231,16 @@ def mean_stderr(arr):
     return sample_stddev(arr) / math.sqrt(len(arr))
 
 
+@register_metric(
+    metric="bypass",
+    higher_is_better=True,
+    output_type=["loglikelihood", "multiple_choice", "generate_until"],
+    aggregation="bypass",
+)
+def bypass(items):
+    return items
+
+
 @register_metric(
     metric="mcc",
     higher_is_better=True,
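The `bypass` pair is the scoring half of `--predict_only`: per-sample "scores" pass through unchanged and the aggregate is the constant sentinel `999`, signalling that no real metric was computed. A toy trace mirroring the registered functions, outside the registry machinery:

```python
def bypass(items):    # registered per-sample metric: identity
    return items

def bypass_agg(arr):  # registered aggregation: constant sentinel
    return 999

outputs = ["a cat on a mat", "two dogs playing"]
assert bypass(outputs) == outputs
print(bypass_agg(bypass(outputs)))  # 999 -- a placeholder, not a real score
```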
Empty file modified lmms_eval/api/model.py (100644 → 100755)
18 changes: 18 additions & 0 deletions lmms_eval/api/registry.py (100644 → 100755)
@@ -1,6 +1,8 @@
 from lmms_eval.api.model import lmms
 
 from typing import Callable, Dict
+import logging
+import evaluate as hf_evaluate
 
 eval_logger = logging.getLogger("lmms-eval")
 
@@ -104,6 +106,22 @@ def decorate(fn):
     return decorate
 
 
+def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library...")
+
+    try:
+        metric_object = hf_evaluate.load(name)
+        return metric_object.compute
+    except Exception:
+        eval_logger.error(
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
+        )
+
+
 def register_aggregation(name):
     def decorate(fn):
         assert name not in AGGREGATION_REGISTRY, f"aggregation named '{name}' conflicts with existing registered aggregation!"
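`get_metric` now falls back to the Hugging Face `evaluate` hub when a name is not in the local registry. A sketch of both paths, assuming the `evaluate` package is installed:

```python
from lmms_eval.api.registry import get_metric

# Registry hit: returns the locally registered callable (mcc is registered above).
mcc = get_metric("mcc")

# Forced HF fallback: resolves to evaluate.load("exact_match").compute.
exact_match = get_metric("exact_match", hf_evaluate_metric=True)
print(exact_match(predictions=["7"], references=["7"]))  # {'exact_match': 1.0}
```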
Empty file modified lmms_eval/api/samplers.py (100644 → 100755)