diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index efc762dec..c4cab9b74 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -133,7 +133,27 @@ class Metrics(Enum):
         corpus_level_fn=np.mean,
         higher_is_better=True,
     )
-
+    ruler_match_any = SampleLevelMetric(
+        metric_name="ruler_match_any",
+        sample_level_fn=lambda predictions, golds, formatted_doc: max(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        ),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
+    ruler_match_all = SampleLevelMetric(
+        metric_name="ruler_match_all",
+        sample_level_fn=lambda predictions, golds, formatted_doc: sum(
+            [1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds]
+        )
+        / len(golds),
+        category=MetricCategory.GENERATIVE,
+        use_case=MetricUseCase.SUMMARIZATION,
+        corpus_level_fn=np.mean,
+        higher_is_better=True,
+    )
     bleurt = SampleLevelMetric(
         metric_name="bleurt",
         sample_level_fn=BLEURT().compute,
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 0b4892a20..32301aa55 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -276,7 +276,7 @@ def greedy_until(
             if max_new_tokens is not None:
                 if context_size + max_new_tokens > self.max_length:
                     logger.warning(
-                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
+                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length=} - {max_new_tokens=} = {self.max_length - max_new_tokens} tokens."
                     )
                     context_size = self.max_length - max_new_tokens
                     if context_size < 0:
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 786c4a0b1..28625e813 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -44,6 +44,15 @@
 # fmt: on
 
 
+def ruler(line, task_name: str = None):
+    query = line["input"]
+    choices = line["outputs"]
+    gold_index = 0
+    instruction = "Only answer the question to complete the prompt, without any additional text.\n"
+    query = f"{instruction}{query}"
+
+    return Doc(query=query, instruction=instruction, choices=choices, gold_index=gold_index, task_name=task_name)
+
 def mmmu_pro(line, task_name: Optional[str] = None):
     # fmt: off
     question = line["question"]  # "What is the capital of France?"
@@ -87,7 +96,6 @@ def mmmu_pro(line, task_name: Optional[str] = None):
         instruction=instructions,
     )
 
-
 def mmmu_pro_vision(line, task_name: str = None):
     instruction = (
         "Answer with the option letter from the given choices directly."
@@ -119,14 +127,17 @@ def mmmu_pro_vision(line, task_name: str = None):
         instruction=instruction,
     )
 
-
 def simpleqa(line, task_name: str = None):
     query = line["problem"]
     choices = [line["answer"]]
     gold_index = 0
 
     return Doc(
-        task_name=task_name, query=query, choices=choices, gold_index=gold_index, specific={**eval(line["metadata"])}
+        task_name=task_name,
+        query=query,
+        choices=choices,
+        gold_index=gold_index,
+        specific={**eval(line["metadata"])},
     )
 
 
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
index 39963eac1..a2cdb6e46 100644
--- a/src/lighteval/tasks/extended/__init__.py
+++ b/src/lighteval/tasks/extended/__init__.py
@@ -30,9 +30,10 @@
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
+    import lighteval.tasks.extended.ruler.main as ruler
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb, ruler]
 
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
diff --git a/src/lighteval/tasks/extended/ruler/main.py b/src/lighteval/tasks/extended/ruler/main.py
new file mode 100644
index 000000000..62f6cc6ab
--- /dev/null
+++ b/src/lighteval/tasks/extended/ruler/main.py
@@ -0,0 +1,69 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import lighteval.tasks.default_prompts as prompt
+from lighteval.metrics.metrics import Metrics
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+
+
+subsets = [
+    "niah_single_1",
+    "niah_single_2",
+    "niah_single_3",
+    "niah_multikey_1",
+    "niah_multikey_2",
+    "niah_multikey_3",
+    "niah_multiquery",
+    "niah_multivalue",
+    "vt",
+    "cwe",
+    "fwe",
+    "qa_1",
+    "qa_2",
+]
+
+lengths = [131072, 65536, 32768, 16384, 8192, 4096]
+
+task_configs = []
+
+for subset in subsets:
+    for length in lengths:
+        task_configs.append(
+            LightevalTaskConfig(
+                name=f"ruler_{length}:{subset}",
+                suite=["lighteval"],
+                prompt_function=prompt.ruler,
+                hf_repo=f"SaylorTwift/RULER-{length}-llama-3.2-tokenizer",
+                hf_subset="default",
+                hf_avail_splits=[subset],
+                evaluation_splits=[subset],
+                few_shots_split=None,
+                few_shots_select=None,
+                generation_size=128 if "niah" in subset else 30 if subset == "vt" else 120 if subset == "cwe" else 50,
+                metric=[Metrics.ruler_match_any] if subset in ["qa_1", "qa_2"] else [Metrics.ruler_match_all],
+                stop_sequence=None,
+                trust_dataset=False,
+                version=0,
+            )
+        )
+
+TASKS_TABLE = task_configs
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index c9a31904b..505c670fc 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -31,6 +31,7 @@
 from huggingface_hub import TextGenerationInputGrammarType
 from multiprocess import Pool
 from pytablewriter import MarkdownTableWriter
+from tqdm import tqdm
 
 from lighteval.metrics import (
     apply_generative_metric,
@@ -560,7 +561,7 @@ def load_datasets(tasks: list["LightevalTask"], dataset_loading_processes: int =
                 task.dataset_filter,
                 task.dataset_revision,
             )
-            for task in tasks
+            for task in tqdm(tasks)
         ]
     else:
         with Pool(processes=dataset_loading_processes) as pool:
@@ -627,7 +628,7 @@ def create_requests_from_tasks(  # noqa: C901
     task_dict_items = [(name, task) for name, task in task_dict.items() if len(task.eval_docs()) > 0]
 
     # Get lists of each type of request
-    for task_name, task in task_dict_items:
+    for task_name, task in tqdm(task_dict_items):
         task_docs = list(task.eval_docs())
         n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs)
         evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples)
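
Note (not part of the patch): a minimal standalone sketch of how the two new metric lambdas added to `Metrics` score a single sample. The prediction/gold strings below are made up for illustration; the logic simply mirrors the `ruler_match_any` and `ruler_match_all` definitions in the metrics.py hunk.

```python
# Illustrative re-implementation of the two new RULER metric lambdas (toy data only).


def ruler_match_any(predictions: list[str], golds: list[str]) -> float:
    # 1.0 if at least one gold string appears (case-insensitively) in the first prediction.
    return max(1.0 if g.lower() in predictions[0].lower() else 0.0 for g in golds)


def ruler_match_all(predictions: list[str], golds: list[str]) -> float:
    # Fraction of gold strings that appear (case-insensitively) in the first prediction.
    return sum(1.0 if g.lower() in predictions[0].lower() else 0.0 for g in golds) / len(golds)


if __name__ == "__main__":
    preds = ["The special magic numbers are 4127 and 9031."]
    golds = ["4127", "9031", "5555"]
    print(ruler_match_any(preds, golds))  # 1.0 -> at least one needle was retrieved
    print(ruler_match_all(preds, golds))  # 0.666... -> 2 of the 3 needles were retrieved
```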
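Likewise illustrative only: the nested loop in `src/lighteval/tasks/extended/ruler/main.py` registers one task per (subset, length) pair, i.e. 13 subsets x 6 context lengths = 78 task configs, with names such as `ruler_4096:niah_single_1`. A quick sanity check of the generated name grid, reproducing just the naming logic from that file:

```python
# Reproduces only the task-name grid from ruler/main.py (no lighteval imports needed).
subsets = [
    "niah_single_1", "niah_single_2", "niah_single_3",
    "niah_multikey_1", "niah_multikey_2", "niah_multikey_3",
    "niah_multiquery", "niah_multivalue",
    "vt", "cwe", "fwe", "qa_1", "qa_2",
]
lengths = [131072, 65536, 32768, 16384, 8192, 4096]

names = [f"ruler_{length}:{subset}" for subset in subsets for length in lengths]
print(len(names))  # 78
print(names[:2])   # ['ruler_131072:niah_single_1', 'ruler_65536:niah_single_1']
```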