diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index de06c7ba..ff34b62c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: "build|stubs|^bot/templates/" +exclude: "build|stubs|^bot/templates/|openassistant/templates" default_language_version: python: python3 diff --git a/docs/datasets/README.md b/docs/datasets/README.md index e7cb264e..08c5fd13 100644 --- a/docs/datasets/README.md +++ b/docs/datasets/README.md @@ -11,11 +11,13 @@ and then running: ```python from datasets import load_dataset -dataset = load_dataset("OpenAssistant/") +dataset = load_dataset("OpenAssistant/{dataset-name}") ``` See the instructions below if you'd like to contribute a new dataset to the project. +## Adding a new dataset + ## Uploading a dataset to the Hugging Face Hub Adding a new dataset for the OpenAssistant project typically involves the following steps: @@ -29,8 +31,8 @@ Adding a new dataset for the OpenAssistant project typically involves the follow To upload a dataset to the OpenAssistant organization, you first need to: -* Create a [Hugging Face account](https://huggingface.co/join) (it's free) -* Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side +- Create a [Hugging Face account](https://huggingface.co/join) (it's free) +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions). 
@@ -76,8 +78,8 @@ python -m pip install datasets We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files: -* A dataset loading script -* A dataset card +- A dataset loading script +- A dataset card Simply provide the name of the new dataset and your access token from Step 1, and you're good to go! @@ -97,5 +99,4 @@ from datasets import load_dataset load_dataset("OpenAssistant/my_dataset") ``` -Congratulations - you've now added a dataset to the Hub! - +Congratulations - you've now added a dataset to the OpenAssistant org! diff --git a/openassistant/__init__.py b/openassistant/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md new file mode 100644 index 00000000..b7899eb7 --- /dev/null +++ b/openassistant/templates/README.md @@ -0,0 +1,31 @@ +# Dataset instructions for {dataset_name} + +## Setup + +```bash +python -m pip install -r requirements.txt +``` + +## Download + +```bash +python download.py +``` + +## Preprocess + +```bash +python preprocess.py +``` + +## Load + +```bash +python load.py +``` + +## Upload + +```bash +python upload.py +``` diff --git a/openassistant/templates/requirements.txt b/openassistant/templates/requirements.txt new file mode 100644 index 00000000..74fb5166 --- /dev/null +++ b/openassistant/templates/requirements.txt @@ -0,0 +1,2 @@ +datasets>=2.8 +typer diff --git a/openassistant/templates/template.py b/openassistant/templates/template.py new file mode 100644 index 00000000..b6c52871 --- /dev/null +++ b/openassistant/templates/template.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the OpenAssistant repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/dataset_script + +This template is adapted from the one provided by BigScience's BigBIO library: +https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py + +TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset. 
+""" + +import os +from typing import Dict, List, Tuple + +import datasets + +from openassistant.utils.configs import OpenAssistantConfig + +# TODO: import the schema that fits your dataset: +from openassistant.utils.schemas import + +# TODO: Add BibTeX citation where appropriate +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match the script name) +# E.g. The Pile: [dataset_name] --> the_pile +_DATASETNAME = "[dataset_name]" +# TODO: create a pretty display name for your dataset +_DISPLAYNAME = "Dataset Name" + +# TODO: Add a description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the license for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the URLs needed to download your dataset files. +# This variable can be a relative path for datasets whose files need to be +# manually downloaded or preprocessed in advance. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or relative path like ./data ", +} + +# TODO: add the tasks supported by the dataset. 
One dataset may support multiple tasks +_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. If none exists, use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_VERSION = "" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: ThePile --> ThePileDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version(_VERSION) + + # You will be able to load each dataset with + # dataset = datasets.load_dataset('my_dataset') + + # TODO: For each dataset, implement a config for each subset; + # If a dataset contains more than one subset, implement a config for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config e.g. the_pile_[schema_name] + # - version: VERSION + # - description: one line description for the dataset + # - schema: open_assistant_[schema_name] + # - subset_id: subset id is the canonical name for the dataset (e.g. the_pile) + # where [schema_name] = (language_modeling) + + BUILDER_CONFIGS = [ + OpenAssistantConfig( + name=f"{_DATASETNAME}_[schema_name]", + version=VERSION, + description=f"OpenAssistant dataset config for {_DATASETNAME}", + schema="[schema_name]", + subset_id=_DATASETNAME, + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_[schema_name]" # must be the name of one of the BUILDER_CONFIGS above + + def _info(self) -> datasets.DatasetInfo: + # TODO: Implement the schema for your dataset here. 
+ raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access a config choice, that will be in self.config.name + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. 
+ + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "[schema_name]": + # TODO: yield (key, example) tuples in the given schema + for key, example in thing: + yield key, example + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) diff --git a/openassistant/utils/__init__.py b/openassistant/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/utils/configs.py b/openassistant/utils/configs.py new file mode 100644 index 00000000..2391f66b --- /dev/null +++ b/openassistant/utils/configs.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +from dataclasses import dataclass + +import datasets + + +@dataclass +class OpenAssistantConfig(datasets.BuilderConfig): + """BuilderConfig for OpenAssistant datasets.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None diff --git a/openassistant/utils/schemas/__init__.py b/openassistant/utils/schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/utils/schemas/language_modeling.py b/openassistant/utils/schemas/language_modeling.py new file mode 100644 index 00000000..0b2072a5 --- /dev/null +++ b/openassistant/utils/schemas/language_modeling.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +""" +Language Modeling Schema +""" +import datasets + +features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": 
[datasets.Value("string")], + } +)