def _theme_from_record(dat):
    """Derive a short theme phrase from a SODA record.

    Args:
        dat: A SODA record; only its ``"relation"`` and ``"tail"`` fields are read.

    Returns:
        The theme string built from the relation/tail pair, with the
        ``PersonY`` placeholder replaced by "another person".
    """
    relation = dat["relation"]
    tail = dat["tail"]

    if relation == "xWant":
        theme = "wanting " + tail
    elif relation == "xNeed":
        theme = "needing " + tail
    elif tail.startswith("and "):
        theme = "people are " + tail.replace("and PersonY ", "")
    elif tail.startswith("to "):
        theme = tail
    else:
        theme = "being " + tail

    theme = theme.replace("PersonY", "another person")
    # "being " + "is ..." would otherwise read as "being is ...".
    return theme.replace("being is", "being")


def _build_conversation(dat):
    """Render one SODA record as a User/Assistant conversation string.

    One of five conversation shapes is picked at random. The sequence of
    ``random`` calls matters: it determines which shape a record gets for a
    given seed, so the order of the branches below must not be rearranged.

    Args:
        dat: A SODA record; reads ``"literal"``, ``"narrative"``,
            ``"dialogue"``, ``"speakers"``, ``"relation"`` and ``"tail"``.

    Returns:
        The formatted conversation text.
    """
    title = dat["literal"]
    story = dat["narrative"]
    theme = _theme_from_record(dat)
    dialogue = [f"{speaker}: {utterance}" for utterance, speaker in zip(dat["dialogue"], dat["speakers"])]
    join_lines = "\n ".join

    if random.randint(0, 6) == 0:
        # User supplies the dialogue and asks for a description, then a title.
        conversation = SUMMARY_TEMPLATE.format(dialogue=join_lines(dialogue), story=story, title=title)
    elif random.randint(0, 6) == 0:
        # User supplies the story and asks for a dialogue, then a title.
        conversation = NEW_DIALOGUE_TEMPLATE.format(story=story, dialogue=join_lines(dialogue), title=title)
    elif random.randint(0, 3) == 0:
        # User supplies the start of the dialogue and asks for the last few lines.
        if random.randint(0, 1) == 0:
            depth = -5
        elif random.randint(0, 1) == 0:
            depth = -3
        else:
            depth = -4
        conversation = NEXT_LINES_TEMPLATE.format(
            scene=join_lines(dialogue[:depth]),
            dialogue=join_lines(dialogue[depth:]),
            title=title,
            story=story,
        )
    elif random.randint(0, 3) == 0:
        # User asks for a story and dialogue about the first sentence of the title;
        # the second sentence may be used to prompt for more dialogue.
        sentences = title.split(".")
        title1 = sentences[0]
        # Titles without a period have no second sentence; reuse the first one
        # instead of raising IndexError.
        title2 = sentences[1] if len(sentences) > 1 else title1
        conversation = NEW_STORY_AND_DIALOGUE_TEMPLATE.format(title1=title1, story=story)
        if random.randint(0, 1) == 0:
            conversation = FULL_DIALOGUE_TEMPLATE.format(conversation=conversation, dialogue=join_lines(dialogue))
        elif random.randint(0, 1) == 0 and len(dialogue) > 5:
            conversation = MORE_DIALOGUE_TEMPLATE.format(
                conversation=conversation,
                dialogue1=join_lines(dialogue[:-5]),
                title2=title2,
                dialogue2=join_lines(dialogue[-5:]),
            )
        elif random.randint(0, 1) == 0:
            conversation = NEXT_DIALOGUE_TEMPLATE.format(
                conversation=conversation,
                dialogue1=join_lines(dialogue[:-3]),
                dialogue2=join_lines(dialogue[-3:]),
            )
        else:
            conversation = MORE_DIALOGUE_TEMPLATE.format(
                conversation=conversation,
                dialogue1=join_lines(dialogue[:-4]),
                title2=title2,
                dialogue2=join_lines(dialogue[-4:]),
            )
    else:
        # User supplies the theme and asks for a story and dialogue; the theme
        # is already part of this template, so no theme follow-up is appended.
        return NEW_STORY_AND_DIALOGUE_FROM_THEME_TEMPLATE.format(
            theme=theme, story=story, dialogue=join_lines(dialogue), title=title
        )

    if theme:
        conversation += THEME_TEMPLATE.format(theme=theme)
    return conversation


def main(output_dir: str = "data"):
    """Download the SODA dataset and write synthetic conversations as JSONL.

    For each of the ``train``, ``test`` and ``validation`` splits, writes
    ``<output_dir>/<split>.jsonl`` with one ``{"conversation": ...}`` object
    per source record. The random generator is seeded so repeated runs
    produce the same files. When the module-level ``PRINT`` flag is truthy,
    every conversation is also echoed to stdout.

    Args:
        output_dir: Directory that receives the JSONL files; created if missing.
    """
    random.seed(42)
    dataset = load_dataset("allenai/soda")
    os.makedirs(output_dir, exist_ok=True)

    for split in ["train", "test", "validation"]:
        with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:
            # Iterate the records of *this* split; indexing dataset["train"]
            # here would fill every output file with train data.
            for dat in tqdm(dataset[split], desc=split):
                conversation = _build_conversation(dat)

                if PRINT:
                    print("##")
                    print(conversation)

                output.write(f"{json.dumps({'conversation': conversation})}\n")
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This dataset is a set of dialogues synthesized from the SODA dataset. +In each dialogue, User and Assistant have a conversation about a story. + +The original Colab notebook for this dataset can be found at: +https://colab.research.google.com/drive/1Sw3px5dP8whdqT7QMNoqwmqIasZkMbJi?usp=sharing +""" + +import json +from typing import Dict, List, Tuple + +import datasets + +from .hub import OpenAssistantConfig, features + +_CITATION = """\ +@article{ontocord2023sodasynth, + author = {ontocord and Jeffrey Quesnelle}, + title = {SODA Synthetic Dialogue}, + year = {2023} +} +""" +_DATASETNAME = "soda_synthetic_dialogue" +_DISPLAYNAME = "🥤SODA Synthetic Dialogue" +_DESCRIPTION = "A set of dialogues synthesized from the SODA dataset." 
class SODASyntheticDialogueDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder for dialogues synthesized from the SODA dataset."""

    VERSION = datasets.Version(_VERSION)

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_dialogue_modeling"

    # Single configuration exposing the data under the "dialogue_modeling" schema.
    BUILDER_CONFIGS = [
        OpenAssistantConfig(
            name=DEFAULT_CONFIG_NAME,
            version=VERSION,
            description=f"OpenAssistant dataset config for {_DATASETNAME}",
            schema="dialogue_modeling",
            subset_id=_DATASETNAME,
        )
    ]

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata assembled from the module-level constants."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Build one SplitGenerator each for the train, test and validation splits.

        Every generator receives the full result of
        ``dl_manager.download_and_extract`` as ``filepath`` together with the
        key of its own split.
        """
        resolved_paths = dl_manager.download_and_extract(_URLS[_DATASETNAME])
        split_table = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"filepath": resolved_paths, "split": split_key},
            )
            for split_name, split_key in split_table
        ]

    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
        """Yield ``(key, example)`` pairs, one per line of the split's JSONL file.

        ``filepath`` is indexed by ``split`` to find the file to read. Keys are
        consecutive line numbers starting at 0. Nothing is yielded when the
        active config's schema is not "dialogue_modeling".
        """
        if self.config.schema != "dialogue_modeling":
            return

        with open(filepath[split], "r", encoding="utf8") as handle:
            for key, line in enumerate(handle):
                yield key, json.loads(line)