diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 27a6511d..0bb81e89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: build|stubs|^bot/templates/$ +exclude: build|stubs|^bot/templates/$|openassistant/templates repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 00000000..ab039871 --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,421 @@ +# OpenAssistant Datasets + +The datasets for this project are currently hosted as loading scripts on the +[Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging +Face Hub. Each of them can be loaded by first installing the 🤗 Datasets +library: + +```bash +python -m pip install datasets +``` + +and then running: + +```python +from datasets import load_dataset + +dataset = load_dataset("OpenAssistant/{dataset-name}") +``` + +We use this GitHub repository to accept new submissions and standardize quality +control. See the instructions below if you'd like to contribute a new dataset to +the project. + +## Adding a new dataset + +### 0. Pre-Requisites + +Install Git and create a GitHub account prior to implementing a dataset; you can +follow instructions to install Git +[here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). + +You will also need at least Python 3.8+. If you are installing Python, we +recommend downloading +[Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a +python environment with necessary packages. **We strongly recommend Python 3.8+ +for stability**. + +### 1. **Fork the OpenAssistant repository** + +Fork the +`OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). To do +this, click the link to the repository and click "Fork" in the upper-right +corner. You should get an option to fork to your account, provided you are +signed into Github. 
+ +After you fork, clone the repository locally. You can do so as follows: + +```bash +git clone git@github.com:<your_github_username>/Open-Assistant.git +cd Open-Assistant # enter the directory +``` + +Next, you want to set your `upstream` location to enable you to push/pull (add +or receive updates). You can do so as follows: + +```bash +git remote add upstream git@github.com:LAION-AI/Open-Assistant.git +``` + +You can optionally check that this was set properly by running the following +command: + +```bash +git remote -v +``` + +The output of this command should look as follows: + +```bash +origin git@github.com:<your_github_username>/Open-Assistant.git (fetch) +origin git@github.com:<your_github_username>/Open-Assistant.git (push) +upstream git@github.com:LAION-AI/Open-Assistant.git (fetch) +upstream git@github.com:LAION-AI/Open-Assistant.git (push) +``` + +If you do NOT have an `origin` for whatever reason, then run: + +```bash +git remote add origin git@github.com:<your_github_username>/Open-Assistant.git +``` + +The goal of `upstream` is to keep your repository up-to-date with any changes that +are made officially to the OpenAssistant repo. You can do this by +running the following commands: + +``` +git fetch upstream +git pull +``` + +Provided you have no _merge conflicts_, this will ensure the repo stays +up-to-date as you make changes. However, before you make changes, you should +make a custom branch to implement your changes. + +You can make a new branch as such: + +``` +git checkout -b <dataset_name> +``` + +

Please do not make changes on the main branch!

+ +Always make sure you're on the right branch with the following command: + +``` +git branch +``` + +The correct branch will have an asterisk \* in front of it. + +### 2. **Create a development environment** + +You can make an environment in any way you choose to. We highlight two possible +options: + +#### 2a) Create a conda environment + +The following instructions will create an Anaconda `openassistant` environment. + +- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your + appropriate operating system. +- Run the following commands from the root folder of the repository (you can pick your + python version): + +```bash +conda create -n openassistant python=3.8 # Creates a conda env +conda activate openassistant # Activate your conda environment +cd openassistant +pip install -r dev-requirements.txt # Install this while in the openassistant folder +``` + +You can deactivate your environment at any time by either exiting your terminal +or using `conda deactivate`. + +#### 2b) Create a venv environment + +Python 3.3+ has venv automatically installed; official information is found +[here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). + +``` +python3 -m venv <env_name> +source <env_name>/bin/activate # activate environment +cd openassistant +pip install -r dev-requirements.txt # Install this while in the openassistant folder +``` + +Make sure your `pip` package points to your environment's source. + +### 3. Prepare a folder in `datasets` for your dataloader + +Make a new directory within the `openassistant/datasets` directory: + +```bash +mkdir openassistant/datasets/<dataset_name> +``` + +**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when +choosing a `<dataset_name>`. + +Add an `__init__.py` file to this directory: + +```bash +touch openassistant/datasets/<dataset_name>/__init__.py +``` + +Next, copy the `template.py` script and `hub.py` module of `templates` into your +dataset folder. 
The `template.py` script has "TODOs" to fill in for your +dataloading script: + +```bash +cp templates/hub.py openassistant/datasets/<dataset_name>/ +cp templates/template.py openassistant/datasets/<dataset_name>/<dataset_name>.py +``` + +#### (Optional) Prepare local dataset files + +If your dataset files aren't publicly available via URLs (e.g. because you +implemented a web scraper), you'll need to implement some extra logic to store +and prepare the data locally prior to implementing a loading script in 🤗 +Datasets. + +To do so, first copy the template script for dataset creation: + +```bash +cp templates/prepare.py openassistant/datasets/<dataset_name>/ +``` + +Next, implement any logic that is needed to prepare a local version of the +dataset files (by convention we store them in `datasets/<dataset_name>/data/`). +Add any extra dependencies to a `requirements.txt` file and provide instructions +on how to prepare the dataset files in a README: + +```bash +touch openassistant/datasets/<dataset_name>/requirements.txt +cp templates/README.md openassistant/datasets/<dataset_name>/ +``` + +**Note:** Do not commit any dataset files to the OpenAssistant repo - all data +will be hosted on the Hugging Face Hub. This step is needed for the project's +data admins to be able to replicate the dataset creation process before pushing +to the Hub. + +### 4. Implement your dataset + +To implement your dataloader, you will need to follow `template.py` and fill in +all necessary TODOs. There are three key methods that are important: + +- `_info`: Specifies the schema of the expected dataloader +- `_split_generators`: Downloads and extracts data for each split (e.g. + train/val/test) or associates local data with each split. +- `_generate_examples`: Creates examples from data that conform to each schema + defined in `_info`. + +For the `_info` function, you will need to define `features` for your +`DatasetInfo` object. For each dataset config, choose the right schema from our +list of examples. You can find the schemas in the +[schemas directory](openassistant/utils/schemas/). 
+ +You will use this schema in the `_generate_examples` return value. + +Populate the information in the dataset according to this schema; some fields +may be empty. + +#### Example scripts + +TODO + +#### Running & debugging + +You can run your data loader script during development by appending the +following statement to your code ([templates/template.py](templates/template.py) +already includes this): + +```python +if __name__ == "__main__": + datasets.load_dataset(__file__) +``` + +If you want to use an interactive debugger during development, you will have to +use `breakpoint()` instead of setting breakpoints directly in your IDE. Most +IDEs will recognize the `breakpoint()` statement and pause there during +debugging. If your preferred IDE doesn't support this, you can always run the +script in your terminal and debug with `pdb`. + +### 5. Check if your dataloader works + +Make sure your dataset is implemented correctly by checking in python the +following commands: + +```python +from datasets import load_dataset + +data = load_dataset("openassistant/datasets//.py", name="_") +``` + +Run these commands from the top level of the `OpenAssistant` repo. + +### 6. Create a dataset card + +Copy and fill out the template dataset card: + +```bash +cp templates/dataset_card.md openassistant/datasets//README.md +``` + +### 7. Format your code + +From the main directory, run the code quality checks via the following command: + +``` +pre-commit run --all-files +``` + +This runs the black formatter, isort, and lints to ensure that the code is +readable and looks nice. Flake8 linting errors may require manual changes. + +### 8. 
Commit your changes + +First, commit your changes to the branch to "add" the work: + +``` +git add openassistant/datasets//*.py +git commit -m "A message describing your commits" +``` + +Then, run the following commands to incorporate any new changes in the master +branch of datasets as follows: + +``` +git fetch upstream +git rebase upstream/main +``` + +**Run these commands in your custom branch**. + +Push these changes to **your fork** with the following command: + +``` +git push -u origin +``` + +### 9. **Make a pull request** + +Make a Pull Request to implement your changes on the main repository +[here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New +Pull Request". Then, choose your branch from your fork to push into "base:main". + +When opening a PR, please link the +[issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your +dataset using +[closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) +in the PR's description, e.g. `resolves #17`. + +## [Admins] Uploading a dataset to the Hugging Face Hub + +Uploading a new dataset from `openassistant/datasets/` to the +Hugging Face Hub typically involves the following steps: + +1. Setup +2. Create a new dataset repository +3. Copy a dataset loading script and dataset card +4. Upload to the Hub + +### 1. Setup + +To upload a dataset to the OpenAssistant organization, you first need to: + +- Create a [Hugging Face account](https://huggingface.co/join) (it's free) +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by + clicking on the _Request to join this org_ button on the top right-hand side + +Next, check that you're correctly logged in and that `git-lfs` is installed so +that the dataset can be uploaded. 
To log in, create a **write access token** +that can be found under your Hugging Face profile (icon in the top right corner +on [hf.co](http://hf.co/)), then Settings -> Access Tokens -> User Access Tokens +-> New Token. Alternatively, you can go to +[your token settings](https://huggingface.co/settings/tokens) directly. + +Once you've created a token, run: + +```bash +huggingface-cli login +``` + +in a terminal, or, in case you're working in a notebook: + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +You can then copy-paste your token to log in locally. + +Next, let's make sure that `git-lfs` is correctly installed. To do so, simply +run: + +```bash +git-lfs -v +``` + +The output should show something like +`git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that +the `git-lfs` command was not found, please make sure to install it from +[here](https://git-lfs.github.com/) or simply via: + +```bash +sudo apt-get install git-lfs +git config --global user.email "you@example.com" +git config --global user.name "Your Name" +``` + +The final step of the setup is to install the 🤗 Datasets library by running: + +```bash +python -m pip install datasets +``` + +### 2. Create a new dataset repository + +Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for +instructions on creating a new dataset repo on the Hub. Use the same snake_case +name as the dataset in `openassistant/datasets/<dataset_name>`. + +Once you've created the dataset repo, clone it by running: + +```bash +git clone https://huggingface.co/datasets/OpenAssistant/<dataset_name> +cd <dataset_name> +``` + +### 3. Copy a dataset loading script and dataset card + +Next, copy the loading script and dataset card to your repo: + +```bash +cp openassistant/datasets/<dataset_name>/<dataset_name>.py . +cp openassistant/datasets/<dataset_name>/README.md . 
+``` + +#### (Optional) Prepare local dataset files + +If the dataset files of `openassistant/datasets/` aren't public, +you'll need to run the `openassistant/datasets//prepare.py` script +to create them. Store them in the same directory that is specified by the +loading script (`data` by default). + +### 4. Upload to the Hub + +Once the dataset script and card are ready, use Git to push them to the Hub +(along with any data files you may need). + +At this point, you can load the dataset by running: + +```python +from datasets import load_dataset + +load_dataset("OpenAssistant/{dataset_name}") +``` + +Congratulations - you've now added a dataset to the OpenAssistant org! diff --git a/openassistant/__init__.py b/openassistant/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/openassistant/dev-requirements.txt b/openassistant/dev-requirements.txt new file mode 100644 index 00000000..fe709d0e --- /dev/null +++ b/openassistant/dev-requirements.txt @@ -0,0 +1 @@ +datasets>=2.8,<3.0.0 diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md new file mode 100644 index 00000000..b174c47e --- /dev/null +++ b/openassistant/templates/README.md @@ -0,0 +1,10 @@ +# Dataset preparation instructions for {dataset_name} + +## Setup + +Add any installation details here. + +## Usage + +Explain how to run any scripts that involve preparing local dataset files, e.g. +if the dataset files aren't public or are produced by a web scraper. diff --git a/openassistant/templates/dataset_card.md b/openassistant/templates/dataset_card.md new file mode 100644 index 00000000..76736d8f --- /dev/null +++ b/openassistant/templates/dataset_card.md @@ -0,0 +1,28 @@ +--- +license: mit +tags: +- open-assistant +- human-feedback +- dialogue-modeling +- language-modeling +--- + +# Dataset card for {dataset_name} + +This is a dataset card template for the [LAION-AI OpenAssistant project](https://github.com/LAION-AI/Open-Assistant). 
Fill out this template when adding a new dataset to the Hugging Face Hub. + +## Dataset summary + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Usage + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Source data + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) + +## Citation + +[More information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards) \ No newline at end of file diff --git a/openassistant/templates/hub.py b/openassistant/templates/hub.py new file mode 100644 index 00000000..49194e31 --- /dev/null +++ b/openassistant/templates/hub.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass + +import datasets + + +@dataclass +class OpenAssistantConfig(datasets.BuilderConfig): + """BuilderConfig for OpenAssistant datasets.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +lm_features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": [datasets.Value("string")], + } +) diff --git a/openassistant/templates/prepare.py b/openassistant/templates/prepare.py new file mode 100644 index 00000000..83a6b15b --- /dev/null +++ b/openassistant/templates/prepare.py @@ -0,0 +1,8 @@ +import typer + +def main(output_dir: str = "data"): + """Download and prepare the dataset for use.""" + raise NotImplementedError + +if __name__ == "__main__": + typer.run(main) \ No newline at end of file diff --git a/openassistant/templates/template.py b/openassistant/templates/template.py new file mode 100644 index 00000000..391df55f --- /dev/null +++ b/openassistant/templates/template.py @@ -0,0 +1,205 @@ +# Copyright 2023 The OpenAssistant Authors and the current 
dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the OpenAssistant repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/dataset_script + +This template is adapted from the one provided by BigScience's BigBIO library: +https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py + +TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset. +""" + +import os +from typing import Dict, List, Tuple + +import datasets + +from .hub import OpenAssistantConfig + +# TODO: import the schema (i.e. 
features) that fits your dataset: +from .hub import + +# TODO: Add BibTeX citation where appropriate +_CITATION = """\ +@article{, + author = {}, + title = {}, + journal = {}, + volume = {}, + year = {}, + url = {}, + doi = {}, + biburl = {}, + bibsource = {} +} +""" + +# TODO: create a module level variable with your dataset name (should match the script name) +# E.g. The Pile: [dataset_name] --> the_pile +_DATASETNAME = "[dataset_name]" +# TODO: create a pretty display name for your dataset +_DISPLAYNAME = "Dataset Name" + +# TODO: Add a description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This dataset is designed for XXX NLP task. +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "" + +# TODO: Add the licence for the dataset here (if possible) +# Note that this doesn't have to be a common open source license. +# Some datasets have custom licenses. In this case, simply put the full license terms +# into `_LICENSE` +_LICENSE = "" + +# TODO: Add links to the URLs needed to download your dataset files. +# This variable can be a relative path for datasets whose files need to be +# manually downloaded or preprocessed in advance. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "url or list of urls or relative path like ./data ", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. 
if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_VERSION = "" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +# Append "Dataset" to the class name: ThePile --> ThePileDataset +class NewDataset(datasets.GeneratorBasedBuilder): + """TODO: Short description of my dataset.""" + + VERSION = datasets.Version(_VERSION) + + # You will be able to load each dataset with + # dataset = datasets.load_dataset('my_dataset') + + # TODO: For each dataset, implement a config for each subset; + # If a dataset contains more than one subset, implement a config for EACH of them. + # Each of them should contain: + # - name: should be unique for each dataset config eg. the_pile_[schema_name] + # - version: VERSION + # - description: one line description for the dataset + # - schema: open_assistant_[schema_name] + # - subset_id: subset id is the canonical name for the dataset (eg. the_pile) + # where [schema_name] = (language_modeling) + + BUILDER_CONFIGS = [ + OpenAssistantConfig( + name=f"{_DATASETNAME}_[schema_name]", + version=VERSION, + description=f"OpenAssistant dataset config for {_DATASETNAME}", + schema="[schema_name]", + subset_id=_DATASETNAME, + ) + ] + + DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_[schema_name]" + + def _info(self) -> datasets.DatasetInfo: + # TODO: Implement the schema for your dataset here. 
+ raise NotImplementedError() + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access a config choice, that will be in self.config.name + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + "split": "test", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, "dev.jsonl"), + "split": "dev", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. 
+ + def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + + # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + + # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + if self.config.schema == "[schema_name]": + # TODO: yield (key, example) tuples in the given schema + for key, example in thing: + yield key, example + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__)