From 7f216baa426068957c2c71bd7ca40984c58d486c Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Mon, 9 Jan 2023 16:03:30 +0000 Subject: [PATCH 1/5] add soda_synethetic_dialogue dataset --- openassistant/datasets/__init__.py | 0 .../soda_synthetic_dialogue/README.md | 94 +++++++ .../soda_synthetic_dialogue/__init__.py | 0 .../datasets/soda_synthetic_dialogue/hub.py | 21 ++ .../soda_synthetic_dialogue/prepare.py | 245 ++++++++++++++++++ .../soda_synthetic_dialogue.py | 123 +++++++++ 6 files changed, 483 insertions(+) create mode 100644 openassistant/datasets/__init__.py create mode 100644 openassistant/datasets/soda_synthetic_dialogue/README.md create mode 100644 openassistant/datasets/soda_synthetic_dialogue/__init__.py create mode 100644 openassistant/datasets/soda_synthetic_dialogue/hub.py create mode 100644 openassistant/datasets/soda_synthetic_dialogue/prepare.py create mode 100644 openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py diff --git a/openassistant/datasets/__init__.py b/openassistant/datasets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/datasets/soda_synthetic_dialogue/README.md b/openassistant/datasets/soda_synthetic_dialogue/README.md new file mode 100644 index 00000000..ef5dfdf8 --- /dev/null +++ b/openassistant/datasets/soda_synthetic_dialogue/README.md @@ -0,0 +1,94 @@ +--- +annotations_creators: +- no-annotation +language: +- en +language_creators: +- machine-generated +license: +- mit +multilinguality: +- monolingual +pretty_name: "SODA Synthetic Dialogue" +size_categories: +- 1M 1 and sys.argv[1] == "--print" + + +def main(output_dir: str = "data"): + """Download and prepare the dataset for use.""" + + random.seed(42) + dataset = load_dataset("allenai/soda") + os.makedirs(output_dir, exist_ok=True) + + for split in ["train", "test", "validation"]: + with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output: + + for i in tqdm(range(len(dataset[split])), desc=split): + dat = dataset["train"][i] + title = dat["literal"] + story = dat["narrative"] + + if dat["relation"] == "xWant": + theme = "wanting " + dat["tail"] + elif dat["relation"] == "xNeed": + theme = "needing " + dat["tail"] + elif not dat["tail"].startswith("to ") and not dat["tail"].startswith("and "): + theme = "being " + dat["tail"] + elif dat["tail"].startswith("and "): + theme = "people are " + dat["tail"].replace("and PersonY ", "") + else: + theme = dat["tail"] + theme = theme.replace("PersonY", "another person") + theme = theme.replace("being is", "being") + + dialogue = [s2 + ": " + s1 for s1, s2 in zip(dat["dialogue"], dat["speakers"])] + + if random.randint(0, 6) == 0: + # print("##") + # print(f"User: Can you give me a short story description for this dialog?") + # print(" " + "\n ".join(dialog)) + # print(f"Assistant: Sure, a short story description for this dialog could be: \n {story}") + # print("User: And a title?") + # print(f"Assistant: Sure, a title for this dialog could be: \n {title}") + # if theme: + # print("User: What would be one theme of this story?") + # print(f'Assistant: One theme of this story could be: "{theme}"') + conversation = SUMMARY_TEMPLATE.format(dialogue="\n ".join(dialogue), story=story, title=title) + if theme: + conversation = conversation + THEME_TEMPLATE.format(theme=theme) + elif random.randint(0, 6) == 0: + # print("##") + # print(f"User: Can you write a short dialog based on this story:\n {story}") + # print(f"Assistant: Sure, a dialog for this story could be:") + # print(" " + "\n ".join(dialog)) + # print("User: And a title?") + # print(f"Assistant: Sure, a title for this dialog could be: \n {title}") + # if theme: + # print("User: What would be one theme of this story?") + # print(f'Assistant: One theme of this story could be: "{theme}"') + conversation = NEW_DIALOGUE_TEMPLATE.format( + story=story, dialogue="\n ".join(dialogue), title=title + ) + if theme: + conversation = conversation + THEME_TEMPLATE.format(theme=theme) + elif random.randint(0, 3) == 0: + # print("##") + # print(f"User: Can you write the next few lines of dialog for this scene:") + # if random.randint(0, 1) == 0: + # print(" " + "\n ".join(dialog[:-5])) + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-5:])) + # elif random.randint(0, 1) == 0: + # print(" " + "\n ".join(dialog[:-3])) + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-3:])) + # else: + # print(" " + "\n ".join(dialog[:-4])) + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-4:])) + # print("User: And a title?") + # print(f"Assistant: Sure, a title for this dialog could be: \n {title}") + # print("User: How about a short description?") + # print(f"Assistant: Sure, a short description for this dialog could be: \n {story}") + # if theme: + # print("User: What would be one theme of this story?") + # print(f'Assistant: One theme of this story could be: "{theme}"') + if random.randint(0, 1) == 0: + depth = -5 + elif random.randint(0, 1) == 0: + depth = -3 + else: + depth = -4 + conversation = NEXT_LINES_TEMPLATE.format( + scene="\n ".join(dialogue[:depth]), + dialogue="\n ".join(dialogue[depth:]), + title=title, + story=story, + ) + if theme: + conversation = conversation + THEME_TEMPLATE.format(theme=theme) + elif random.randint(0, 3) == 0: + # print("##") + # title1 = title.split(".")[0] + # title2 = title.split(".")[1] + # print(f"User: Can you write short story and dialog about: {title1}") + # print(f'Assistant: Sure, a short story and dialog about: "{title1}" could be:') + # print(f" {story}") + # if random.randint(0, 1) == 0: + # print(" " + "\n ".join(dialog)) + # elif random.randint(0, 1) == 0 and len(dialog) > 5: + # print(" " + "\n ".join(dialog[:-5])) + # print(f'User: Can you provide more dialog assuming "{title2}"?') + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-5:])) + # elif random.randint(0, 1) == 0: + # print(" " + "\n ".join(dialog[:-3])) + # print("User: more please.") + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-3:])) + # else: + # print(" " + "\n ".join(dialog[:-4])) + # print(f'User: Can you provide more dialog assuming "{title2}"?') + # print(f"Assistant: Sure, the next dialog for this scene could be:") + # print(" " + "\n ".join(dialog[-4:])) + # if theme: + # print("User: What would be one theme of this story?") + # print(f'Assistant: One theme of this story could be: "{theme}"') + title1 = title.split(".")[0] + title2 = title.split(".")[1] + conversation = NEW_STORY_AND_DIALOGUE_TEMPLATE.format(title1=title1, story=story) + if random.randint(0, 1) == 0: + conversation = FULL_DIALOGUE_TEMPLATE.format( + conversation=conversation, dialogue="\n ".join(dialogue) + ) + elif random.randint(0, 1) == 0 and len(dialogue) > 5: + conversation = MORE_DIALOGUE_TEMPLATE.format( + conversation=conversation, + dialogue1="\n ".join(dialogue[:-5]), + title2=title2, + dialogue2="\n ".join(dialogue[-5:]), + ) + elif random.randint(0, 1) == 0: + conversation = NEXT_DIALOGUE_TEMPLATE.format( + conversation=conversation, + dialogue1="\n ".join(dialogue[:-3]), + dialogue2="\n ".join(dialogue[-3:]), + ) + else: + conversation = MORE_DIALOGUE_TEMPLATE.format( + conversation=conversation, + dialogue1="\n ".join(dialogue[:-4]), + title2=title2, + dialogue2="\n ".join(dialogue[-4:]), + ) + if theme: + conversation = conversation + THEME_TEMPLATE.format(theme=theme) + else: + # print("##") + # print(f"User: Can you write short story and dialog based on the theme:\n {theme}") + # print(f'Assistant: Sure, a short story and dialog based on the theme "{theme}" could be:') + # print(f" {story}") + # print(" " + "\n ".join(dialog)) + # print("User: And a title?") + # print(f"Assistant: Sure, a title for this dialog could be: \n {title}") + conversation = NEW_STORY_AND_DIALOGUE_FROM_THEME_TEMPLATE.format( + theme=theme, story=story, dialogue="\n ".join(dialogue), title=title + ) + if PRINT: + print("##") + print(conversation) + + output.write(f"{json.dumps({'conversation': conversation})}\n") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py b/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py new file mode 100644 index 00000000..4b588394 --- /dev/null +++ b/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py @@ -0,0 +1,123 @@ +# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This dataset is a set of dialogues synthesized from the SODA dataset. +In each dialogue, User and Assistant have a conversation about a story. + +The original collab notebook for this dataset can be found at: +https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing +""" + +import json +from typing import Dict, List, Tuple + +import datasets + +from .hub import OpenAssistantConfig + +from .hub import features + +_CITATION = """\ +@article{ontocard2023sodasynth, + author = {ontocard and Jeffrey Quesnelle}, + title = {SODA Synthetic Dialogue}, + year = {2023} +} +""" +_DATASETNAME = "soda_synthetic_dialogue" +_DISPLAYNAME = "🥤SODA Synthetic Dialogue" +_DESCRIPTION = "A set of dialogues synthesized from the SODA dataset." +_HOMEPAGE = "" +_LICENSE = "mit" +_URLS = { + _DATASETNAME: { + 'train': './data/train.jsonl', + 'test': './data/test.jsonl', + 'validation': './data/validation.jsonl' + } +} +_SUPPORTED_TASKS = ["dialogue-modeling"] +_VERSION = "1.0.0" + + +class SODASyntheticDialogueDataset(datasets.GeneratorBasedBuilder): + """A set of dialogues synthesized from the SODA dataset.""" + + VERSION = datasets.Version(_VERSION) + + BUILDER_CONFIGS = [ + OpenAssistantConfig( + name=f"{_DATASETNAME}_dialogue_modeling", + version=VERSION, + description=f"OpenAssistant dataset config for {_DATASETNAME}", + schema="dialogue_modeling", + subset_id=_DATASETNAME, + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_dialogue_modeling" + + def _info(self) -> datasets.DatasetInfo: + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": data_dir, + "split": "train" + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": data_dir, + "split": "test" + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": data_dir, + "split": "validation" + }, + ), + ] + + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "dialogue_modeling": + key = 0 + with open(filepath[split], 'r', encoding='utf8') as data: + while True: + line = data.readline() + if not line: + return + yield key, json.loads(line) + key += 1 From c04abe0259047694b0d231e7afc3970dad26ae63 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Tue, 10 Jan 2023 02:30:36 +0000 Subject: [PATCH 2/5] soda_synthetic_dialogue lint fixes --- .../soda_synthetic_dialogue/README.md | 69 +++++++++++-------- .../soda_synthetic_dialogue/prepare.py | 3 +- .../soda_synthetic_dialogue.py | 27 ++------ 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/openassistant/datasets/soda_synthetic_dialogue/README.md b/openassistant/datasets/soda_synthetic_dialogue/README.md index ef5dfdf8..595089a7 100644 --- a/openassistant/datasets/soda_synthetic_dialogue/README.md +++ b/openassistant/datasets/soda_synthetic_dialogue/README.md @@ -1,41 +1,41 @@ --- annotations_creators: -- no-annotation + - no-annotation language: -- en + - en language_creators: -- machine-generated + - machine-generated license: -- mit + - mit multilinguality: -- monolingual + - monolingual pretty_name: "SODA Synthetic Dialogue" size_categories: -- 1M Date: Tue, 10 Jan 2023 14:26:04 +0000 Subject: [PATCH 3/5] fix soda_synthetic_dialogue author --- openassistant/datasets/soda_synthetic_dialogue/README.md | 6 +++--- .../soda_synthetic_dialogue/soda_synthetic_dialogue.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/openassistant/datasets/soda_synthetic_dialogue/README.md b/openassistant/datasets/soda_synthetic_dialogue/README.md index 595089a7..b645dbfa 100644 --- a/openassistant/datasets/soda_synthetic_dialogue/README.md +++ b/openassistant/datasets/soda_synthetic_dialogue/README.md @@ -89,7 +89,7 @@ text `conversation` feature. ## Source data The script to synthesize this dataset was originally created by -[ontocard](https://github.com/ontocord) in +[ontocord](https://github.com/ontocord) in [this Colab notebook](https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing) and prepared for Hugging Face by [Jeffrey Quesnelle](https://github.com/jquesnelle/). @@ -99,8 +99,8 @@ and prepared for Hugging Face by Please cite our work if you find the resources in this repository useful: ``` -@article{ontocard2023sodasynth, - author = {ontocard and Jeffrey Quesnelle}, +@article{ontocord2023sodasynth, + author = {ontocord and Jeffrey Quesnelle}, title = {SODA Synthetic Dialogue}, year = {2023} } diff --git a/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py b/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py index 856215ec..ddc7c883 100644 --- a/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py +++ b/openassistant/datasets/soda_synthetic_dialogue/soda_synthetic_dialogue.py @@ -28,8 +28,8 @@ import datasets from .hub import OpenAssistantConfig, features _CITATION = """\ -@article{ontocard2023sodasynth, - author = {ontocard and Jeffrey Quesnelle}, +@article{ontocord2023sodasynth, + author = {ontocord and Jeffrey Quesnelle}, title = {SODA Synthetic Dialogue}, year = {2023} } From 4dd329f5089355bcebe22ea24318cb29daa29beb Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Mon, 16 Jan 2023 22:45:28 -0500 Subject: [PATCH 4/5] add link to soda paper --- openassistant/datasets/soda_synthetic_dialogue/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openassistant/datasets/soda_synthetic_dialogue/README.md b/openassistant/datasets/soda_synthetic_dialogue/README.md index b645dbfa..08fde59f 100644 --- a/openassistant/datasets/soda_synthetic_dialogue/README.md +++ b/openassistant/datasets/soda_synthetic_dialogue/README.md @@ -56,7 +56,8 @@ from a title or theme. This data was created by synthesizing the dialogues in [🥤Soda](https://huggingface.co/datasets/allenai/soda) and applying a set of -templates to generate the conversation. +templates to generate the conversation. The original research paper can be +found [here](https://arxiv.org/pdf/2212.10465v1.pdf). Example: From 3222485d6fb23c932745ab929c7883f367a81024 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Mon, 16 Jan 2023 22:57:47 -0500 Subject: [PATCH 5/5] lint fix --- openassistant/datasets/soda_synthetic_dialogue/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openassistant/datasets/soda_synthetic_dialogue/README.md b/openassistant/datasets/soda_synthetic_dialogue/README.md index 08fde59f..c4866e16 100644 --- a/openassistant/datasets/soda_synthetic_dialogue/README.md +++ b/openassistant/datasets/soda_synthetic_dialogue/README.md @@ -56,8 +56,8 @@ from a title or theme. This data was created by synthesizing the dialogues in [🥤Soda](https://huggingface.co/datasets/allenai/soda) and applying a set of -templates to generate the conversation. The original research paper can be -found [here](https://arxiv.org/pdf/2212.10465v1.pdf). +templates to generate the conversation. The original research paper can be found +[here](https://arxiv.org/pdf/2212.10465v1.pdf). Example: