Skip to content

add TUMLU-mini benchmark, solves #577 #811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions community_tasks/turkic_evals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Task to evaluate LLMs on TUMLU-mini benchmark: https://huggingface.co/datasets/jafarisbarov/TUMLU-mini

For more details, see the associated paper:

@misc{isbarov2025tumluunifiednativelanguage,
title={{TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages}},
author={Jafar Isbarov and Arofat Akhundjanova and Mammad Hajili and Kavsar Huseynova and Dmitry Gaynullin and Anar Rzayev and Osman Tursun and Ilshat Saetov and Rinat Kharisov and Saule Belginova and Ariana Kenbayeva and Amina Alisheva and Aizirek Turdubaeva and Abdullatif Köksal and Samir Rustamov and Duygu Ataman},
year={2025},
eprint={2502.11020},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.11020},
}
"""

import random
import re
from functools import partial
from typing import Any, Dict, List, Optional, Union

from lighteval.metrics.llm_as_judge import JudgeLM
from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
from lighteval.metrics.utils.metric_utils import MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


# TUMLU
# fmt: off
TUMLU_SUBSETS = [
"azerbaijani",
"crimean-tatar",
"karakalpak",
"kazakh",
"tatar",
"turkish",
"uyghur",
"uzbek",
"kyrgyz"
]
# fmt: on

INSTRUCTION_BY_LANGUAGE = {
"azerbaijani": "Aşağıdakı sual çoxvariantlı sualdır. Düzgün cavabı seçin:\n\n",
"crimean-tatar": "Aşağıdaki sual çoqtan-çoq cevaplı sualdir. Doğru cevapnı seçip alıñız:\n\n",
"karakalpak": "Tómendegi soraw kóp tańlawlı soraw Tuwrı juwaptı saylań:\n\n",
"kazakh": "Төмендегі сұрақ көп таңдау мүмкіндігі бар сұрақ. Дұрыс жауапты таңдаңыз:\n\n",
"tatar": "Түбәндәге сорау - күп сорау. Дөрес җавапны сайлагыз:\n\n",
"turkish": "Aşağıdaki soru çoktan seçmeli bir sorudur. Doğru cevabı seçin:\n\n",
"uyghur": "تۆۋەندىكى سوئال كۆپ تاللاش سوئالى. توغرا جاۋابنى تاللاڭ:\n\n",
"uzbek": "Quyidagi savol tanlovli savoldir. To‘g‘ri javobni tanlang:\n\n",
"kyrgyz": "Төмөнкү суроо бир нече варианттуу суроо. Туура жоопту тандаңыз:\n\n",
}

ANSWER_BY_LANGUAGE = {
"uzbek": "Javob:",
"uzbek-cyrillic": "Жавоб",
"crimean-tatar": "Cevap:",
"crimean-tatar-cyrillic": "Джевап",
"tatar": "Җавап:",
"kazakh": "Жауап:",
"kazakh-latin": "Jawap",
"karakalpak": "Juwap:",
"kyrgyz": "Жооп:",
"turkish": "Cevap:",
"uyghur": "جاۋاب:",
"uyghur-latin": "Jawab:",
"azerbaijani": "Cavab:",
}


def tumlu_pfn(line, task_name: str = None, language: str = None):
instruction = INSTRUCTION_BY_LANGUAGE[language]

# Create a list of valid choices with corresponding keys
choices = line.get("choices")
valid_keys = ["A", "B", "C", "D", "E"][: len(choices)]

answer_index = valid_keys.index(line.get("answer"))

# Construct the query
query = f"{instruction}{line['question']}\n"
query += "".join([f"{key}. {choice}\n" for key, choice in zip(valid_keys, choices)])
query += ANSWER_BY_LANGUAGE[language]

return Doc(
task_name=task_name,
query=query,
choices=valid_keys, # Return only valid choices
gold_index=answer_index, # Correct index
instruction=instruction,
)


class CustomTUMLUTask(LightevalTaskConfig):
def __init__(
self,
name,
hf_subset,
):
super().__init__(
name=name,
hf_subset=hf_subset,
prompt_function=partial(tumlu_pfn, language=hf_subset),
hf_repo="jafarisbarov/TUMLU-mini",
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split=["dev"],
few_shots_select="sequential",
suite=["community"],
generation_size=-1,
stop_sequence=None,
trust_dataset=True,
version=0,
)


TUMLU_TASKS = [CustomTUMLUTask(name=f"tumlu:{subset}", hf_subset=subset) for subset in TUMLU_SUBSETS]

TASKS_TABLE = TUMLU_TASKS
Loading