From 711717711297fc1ec3039d6010c0e106ad14064a Mon Sep 17 00:00:00 2001 From: Zdenek Kasner Date: Thu, 28 Apr 2022 15:42:37 +0200 Subject: [PATCH 1/5] Add templates for schema_guided_dstc8 response generation --- .../dialogues/templates.yaml | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml diff --git a/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml b/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml new file mode 100644 index 000000000..0b1913db1 --- /dev/null +++ b/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml @@ -0,0 +1,122 @@ +dataset: schema_guided_dstc8 +subset: dialogues +templates: + 23666a79-c235-4c63-a3c1-b5712c7086b0: !Template + answer_choices: null + id: 23666a79-c235-4c63-a3c1-b5712c7086b0 + jinja: '{% set stop = range(2, turns["utterance"]|length, 2) | random() - 1 %} + + {% for utterance in turns["utterance"][:stop] %} + + {{"Chatbot" if turns["speaker"][loop.index] == 0 else "Human"}}: {{utterance}} + + {% endfor %} + + Chatbot: + + ||| + + {{turns["utterance"][stop]}} + + ' + metadata: !TemplateMetadata + choices_in_prompt: false + metrics: + - BLEU + - ROUGE + original_task: false + name: predict_response_random_human_chatbot + reference: '' + 2abe77de-9b9e-49d5-b17f-b9e28352baf3: !Template + answer_choices: null + id: 2abe77de-9b9e-49d5-b17f-b9e28352baf3 + jinja: '{% set stop = range(2, turns["utterance"]|length, 2) | random() - 1 %} + + {% for utterance in turns["utterance"][:stop] %} + + {{"System" if turns["speaker"][loop.index] == 0 else "User"}}: {{utterance}} + + {% endfor %} + + System: + + ||| + + {{turns["utterance"][stop]}} + + ' + metadata: !TemplateMetadata + choices_in_prompt: false + metrics: + - BLEU + - ROUGE + original_task: false + name: predict_response_random_user_system + reference: '' + 8a31e809-9bc7-4d95-898d-d5dad73b4d6e: !Template + answer_choices: null + id: 8a31e809-9bc7-4d95-898d-d5dad73b4d6e + jinja: 'How would you continue the following dialogue? + + {% set stop = range(2, turns["utterance"]|length, 2) | random() - 1 %} + + {% for utterance in turns["utterance"][:stop] %} + + "{{utterance}}" + + {% endfor %} + + + ||| + + "{{turns["utterance"][stop]}}" + + ' + metadata: !TemplateMetadata + choices_in_prompt: false + metrics: + - BLEU + - ROUGE + original_task: false + name: predict_response_random_continue_dialogue + reference: '' + 9af4e56b-ee5f-47e7-9681-16bbdf11ef5a: !Template + answer_choices: null + id: 9af4e56b-ee5f-47e7-9681-16bbdf11ef5a + jinja: "{% set stop = range(2, turns[\"utterance\"]|length, 2) | random() - 1\ + \ %}\n{% for utterance in turns[\"utterance\"][:stop] %}\n- {{utterance}}\n\ + {% endfor %}\n- \n|||\n{{turns[\"utterance\"][stop]}}" + metadata: !TemplateMetadata + choices_in_prompt: false + metrics: + - ROUGE + - BLEU + original_task: false + name: predict_response_random_simple + reference: '' + e828a273-00eb-4fa1-9b2a-4331682980ca: !Template + answer_choices: null + id: e828a273-00eb-4fa1-9b2a-4331682980ca + jinja: '{% set stop = range(2, turns["utterance"]|length, 2) | random() - 1 %} + + {% for utterance in turns["utterance"][:stop] %} + + {{"B" if turns["speaker"][loop.index] == 0 else "A"}}: {{utterance}} + + {% endfor %} + + B: + + ||| + + {{turns["utterance"][stop]}} + + ' + metadata: !TemplateMetadata + choices_in_prompt: false + metrics: + - BLEU + - ROUGE + original_task: false + name: predict_response_random_a_b + reference: '' From 120a78ffb2958f29a4f99f09abfb03084ce84030 Mon Sep 17 00:00:00 2001 From: Zdenek Kasner Date: Wed, 18 May 2022 14:57:23 +0200 Subject: [PATCH 2/5] Remove extra newlines at the end of targets for schema_guided_dstc8 --- .../schema_guided_dstc8/dialogues/templates.yaml | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml b/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml index 0b1913db1..0191b4006 100644 --- a/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml +++ b/promptsource/templates/schema_guided_dstc8/dialogues/templates.yaml @@ -16,9 +16,7 @@ templates: ||| - {{turns["utterance"][stop]}} - - ' + {{turns["utterance"][stop]}}' metadata: !TemplateMetadata choices_in_prompt: false metrics: @@ -42,9 +40,7 @@ templates: ||| - {{turns["utterance"][stop]}} - - ' + {{turns["utterance"][stop]}}' metadata: !TemplateMetadata choices_in_prompt: false metrics: @@ -69,9 +65,7 @@ templates: ||| - "{{turns["utterance"][stop]}}" - - ' + "{{turns["utterance"][stop]}}"' metadata: !TemplateMetadata choices_in_prompt: false metrics: @@ -109,9 +103,7 @@ templates: ||| - {{turns["utterance"][stop]}} - - ' + {{turns["utterance"][stop]}}' metadata: !TemplateMetadata choices_in_prompt: false metrics: From d1f16cfa786ee496588de7117abe5ab6c5326f67 Mon Sep 17 00:00:00 2001 From: Victor SANH Date: Sun, 22 May 2022 22:36:21 +0100 Subject: [PATCH 3/5] Accelerate `get_infos` by caching the `DataseInfoDict`s (#778) * accelerate `get_infos` by caching the `DataseInfoDict`s * quality * consistency --- promptsource/__init__.py | 5 ++++- promptsource/app.py | 32 +++++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/promptsource/__init__.py b/promptsource/__init__.py index d4dacf2cb..21eaa37ed 100644 --- a/promptsource/__init__.py +++ b/promptsource/__init__.py @@ -1 +1,4 @@ -DEFAULT_PROMPTSOURCE_CACHE_HOME = "~/.cache/promptsource" +from pathlib import Path + + +DEFAULT_PROMPTSOURCE_CACHE_HOME = str(Path("~/.cache/promptsource").expanduser()) diff --git a/promptsource/app.py b/promptsource/app.py index ed1bc7076..44e712b84 100644 --- a/promptsource/app.py +++ b/promptsource/app.py @@ -1,20 +1,23 @@ import argparse import functools import multiprocessing +import os import textwrap +from hashlib import sha256 from multiprocessing import Manager, Pool import pandas as pd import plotly.express as px import streamlit as st from datasets import get_dataset_infos +from datasets.info import DatasetInfosDict from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import DjangoLexer -from templates import INCLUDED_USERS +from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME from promptsource.session import _get_state -from promptsource.templates import DatasetTemplates, Template, TemplateCollection +from promptsource.templates import INCLUDED_USERS, DatasetTemplates, Template, TemplateCollection from promptsource.utils import ( get_dataset, get_dataset_confs, @@ -25,6 +28,9 @@ ) +DATASET_INFOS_CACHE_DIR = os.path.join(DEFAULT_PROMPTSOURCE_CACHE_HOME, "DATASET_INFOS") +os.makedirs(DATASET_INFOS_CACHE_DIR, exist_ok=True) + # Python 3.8 switched the default start method from fork to spawn. OS X also has # some issues related to fork, eee, e.g., https://github.com/bigscience-workshop/promptsource/issues/572 # so we make sure we always use spawn for consistency @@ -38,7 +44,17 @@ def get_infos(all_infos, d_name): :param all_infos: multiprocess-safe dictionary :param d_name: dataset name """ - all_infos[d_name] = get_dataset_infos(d_name) + d_name_bytes = d_name.encode("utf-8") + d_name_hash = sha256(d_name_bytes) + foldername = os.path.join(DATASET_INFOS_CACHE_DIR, d_name_hash.hexdigest()) + if os.path.isdir(foldername): + infos_dict = DatasetInfosDict.from_directory(foldername) + else: + infos = get_dataset_infos(d_name) + infos_dict = DatasetInfosDict(infos) + os.makedirs(foldername) + infos_dict.write_to_directory(foldername) + all_infos[d_name] = infos_dict # add an argument for read-only @@ -181,11 +197,13 @@ def show_text(t, width=WIDTH, with_markdown=False): else: subset_infos = infos[subset_name] - split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()} + try: + split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()} + except Exception: + # Fixing bug in some community datasets. + # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0. + split_sizes = {} else: - # Zaid/coqa_expanded and Zaid/quac_expanded don't have dataset_infos.json - # so infos is an empty dic, and `infos[list(infos.keys())[0]]` raises an error - # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0. split_sizes = {} # Collect template counts, original task counts and names From bcc5abd874f4b4c53e72e345020dd23a5bb32124 Mon Sep 17 00:00:00 2001 From: Stephen Bach Date: Tue, 24 May 2022 09:28:44 -0400 Subject: [PATCH 4/5] Revert changes to app.py. --- promptsource/app.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/promptsource/app.py b/promptsource/app.py index 4ebc6f9ff..d76d0564f 100644 --- a/promptsource/app.py +++ b/promptsource/app.py @@ -1,23 +1,20 @@ import argparse import functools import multiprocessing -import os import textwrap -from hashlib import sha256 from multiprocessing import Manager, Pool import pandas as pd import plotly.express as px import streamlit as st from datasets import get_dataset_infos -from datasets.info import DatasetInfosDict from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import DjangoLexer +from templates import INCLUDED_USERS -from promptsource import DEFAULT_PROMPTSOURCE_CACHE_HOME from promptsource.session import _get_state -from promptsource.templates import INCLUDED_USERS, DatasetTemplates, Template, TemplateCollection +from promptsource.templates import DatasetTemplates, Template, TemplateCollection from promptsource.utils import ( get_dataset, get_dataset_confs, @@ -28,9 +25,6 @@ ) -DATASET_INFOS_CACHE_DIR = os.path.join(DEFAULT_PROMPTSOURCE_CACHE_HOME, "DATASET_INFOS") -os.makedirs(DATASET_INFOS_CACHE_DIR, exist_ok=True) - # Python 3.8 switched the default start method from fork to spawn. OS X also has # some issues related to fork, eee, e.g., https://github.com/bigscience-workshop/promptsource/issues/572 # so we make sure we always use spawn for consistency @@ -44,17 +38,7 @@ def get_infos(all_infos, d_name): :param all_infos: multiprocess-safe dictionary :param d_name: dataset name """ - d_name_bytes = d_name.encode("utf-8") - d_name_hash = sha256(d_name_bytes) - foldername = os.path.join(DATASET_INFOS_CACHE_DIR, d_name_hash.hexdigest()) - if os.path.isdir(foldername): - infos_dict = DatasetInfosDict.from_directory(foldername) - else: - infos = get_dataset_infos(d_name) - infos_dict = DatasetInfosDict(infos) - os.makedirs(foldername) - infos_dict.write_to_directory(foldername) - all_infos[d_name] = infos_dict + all_infos[d_name] = get_dataset_infos(d_name) # add an argument for read-only @@ -197,13 +181,11 @@ def show_text(t, width=WIDTH, with_markdown=False): else: subset_infos = infos[subset_name] - try: - split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()} - except Exception: - # Fixing bug in some community datasets. - # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0. - split_sizes = {} + split_sizes = {k: v.num_examples for k, v in subset_infos.splits.items()} else: + # Zaid/coqa_expanded and Zaid/quac_expanded don't have dataset_infos.json + # so infos is an empty dic, and `infos[list(infos.keys())[0]]` raises an error + # For simplicity, just filling `split_sizes` with nothing, so the displayed split sizes will be 0. split_sizes = {} # Collect template counts, original task counts and names @@ -658,4 +640,4 @@ def show_text(t, width=WIDTH, with_markdown=False): if __name__ == "__main__": - run_app() + run_app() \ No newline at end of file From 399ee3b7db4c545354dc9ed48e2ba68704a1e17d Mon Sep 17 00:00:00 2001 From: Stephen Bach Date: Tue, 24 May 2022 09:31:14 -0400 Subject: [PATCH 5/5] Update promptsource/app.py --- promptsource/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promptsource/app.py b/promptsource/app.py index d76d0564f..cc19e6189 100644 --- a/promptsource/app.py +++ b/promptsource/app.py @@ -640,4 +640,4 @@ def show_text(t, width=WIDTH, with_markdown=False): if __name__ == "__main__": - run_app() \ No newline at end of file + run_app()