Fix style

2026-06-27 16:10:30 +08:00 · 2023-01-04 13:45:05 +11:00
parent 44667f4e93
commit ca8d3c8f8d
10 changed files with 275 additions and 8 deletions
@@ -1,4 +1,4 @@
-exclude: "build|stubs|^bot/templates/"
+exclude: "build|stubs|^bot/templates/|openassistant/templates"

 default_language_version:
  python: python3
@@ -11,11 +11,13 @@ and then running:
 ```python
 from datasets import load_dataset

-dataset = load_dataset("OpenAssistant/<dataset-name>")
+dataset = load_dataset("OpenAssistant/{dataset-name}")
 ```

 See the instructions below if you'd like to contribute a new dataset to the project.

+## Adding a new dataset
+
 ## Uploading a dataset to the Hugging Face Hub

 Adding a new dataset for the OpenAssistant project typically involves the following steps:
@@ -29,8 +31,8 @@ Adding a new dataset for the OpenAssistant project typically involves the follow

 To upload a dataset to the OpenAssistant organization, you first need to:

-* Create a [Hugging Face account](https://huggingface.co/join) (it's free)
-* Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side
+- Create a [Hugging Face account](https://huggingface.co/join) (it's free)
+- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side

 By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions).

@@ -76,8 +78,8 @@ python -m pip install datasets

 We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files:

-* A dataset loading script
-* A dataset card
+- A dataset loading script
+- A dataset card

 Simply provide the name of the new dataset and your access token from Step 1, and you're good to go!

@@ -97,5 +99,4 @@ from datasets import load_dataset
 load_dataset("OpenAssistant/my_dataset")
 ```

-Congratulations - you've now added a dataset to the Hub!
-
+Congratulations - you've now added a dataset to the OpenAssistant org!
@@ -0,0 +1,31 @@
+# Dataset instructions for {dataset_name}
+
+## Setup
+
+```bash
+python -m pip install -r requirements.txt
+```
+
+## Download
+
+```bash
+python download.py
+```
+
+## Preprocess
+
+```bash
+python preprocess.py
+```
+
+## Load
+
+```bash
+python load.py
+```
+
+## Upload
+
+```bash
+python upload.py
+```
@@ -0,0 +1,2 @@
+datasets>=2.8
+typer
@@ -0,0 +1,206 @@
+# -*- coding: utf-8 -*-
+# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This template serves as a starting point for contributing a dataset to the OpenAssistant repo.
+
+When modifying it for your dataset, look for TODO items that offer specific instructions.
+
+To create a dataset loading script you will create a class and implement 3 methods:
+  * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
+  * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
+  * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
+
+Full documentation on writing dataset loading scripts can be found here:
+https://huggingface.co/docs/datasets/dataset_script
+
+This template is adapted from the one provided by BigScience's BigBIO library:
+https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py
+
+TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
+"""
+
+import os
+from typing import Dict, List, Tuple
+
+import datasets
+
+from openassistant.utils.configs import OpenAssistantConfig
+
+# TODO: import the schema that fits your dataset:
+from openassistant.utils.schemas import
+
+# TODO: Add a BibTeX citation where appropriate
+_CITATION = """\
+@article{,
+  author    = {},
+  title     = {},
+  journal   = {},
+  volume    = {},
+  year      = {},
+  url       = {},
+  doi       = {},
+  biburl    = {},
+  bibsource = {}
+}
+"""
+
+# TODO: Create a module-level variable with your dataset name (should match the script name)
+#  E.g. The Pile: [dataset_name] --> the_pile
+_DATASETNAME = "[dataset_name]"
+# TODO: Create a pretty display name for your dataset
+_DISPLAYNAME = "Dataset Name"
+
+# TODO: Add a description of the dataset here
+# You can copy an official description
+_DESCRIPTION = """\
+This dataset is designed for XXX NLP task.
+"""
+
+# TODO: Add a link to an official homepage for the dataset here (if possible)
+_HOMEPAGE = ""
+
+# TODO: Add the license for the dataset here (if possible)
+# Note that this doesn't have to be a common open source license.
+# Some datasets have custom licenses. In this case, simply put the full license terms
+# into `_LICENSE`
+_LICENSE = ""
+
+# TODO: Add links to the URLs needed to download your dataset files.
+# This variable can be a relative path for datasets whose files need to be
+# manually downloaded or preprocessed in advance.
+
+# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
+# However, if you need to access different files for each config you can have multiple entries in this dict.
+# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
+# The entry keyed by `_DATASETNAME` is the one read by `_split_generators`.
+_URLS = {
+    _DATASETNAME: "url or list of urls or relative path like ./data ",
+}
+
+# TODO: add supported task by dataset. One dataset may support multiple tasks
+_SUPPORTED_TASKS = []  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
+
+# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
+#  This version doesn't have to be consistent with semantic versioning. Anything that is
+#  provided by the original dataset as a version goes.
+_VERSION = ""
+
+
+# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
+#  Append "Dataset" to the class name: ThePile --> ThePileDataset
+class NewDataset(datasets.GeneratorBasedBuilder):
+    """TODO: Short description of my dataset."""
+
+    VERSION = datasets.Version(_VERSION)
+
+    # You will be able to load each dataset with
+    # dataset = datasets.load_dataset('my_dataset')
+
+    # TODO: For each dataset, implement a config for each subset;
+    #  If a dataset contains more than one subset, implement a config for EACH of them.
+    #  Each of them should contain:
+    #   - name: should be unique for each dataset config eg. the_pile_[schema_name]
+    #   - version: VERSION
+    #   - description: one line description for the dataset
+    #   - schema: open_assistant_[schema_name]
+    #   - subset_id: subset id is the canonical name for the dataset (eg. the_pile)
+    #  where [schema_name] = (language_modeling)
+
+    BUILDER_CONFIGS = [
+        OpenAssistantConfig(
+            name=f"{_DATASETNAME}_[schema_name]",
+            version=VERSION,
+            description=f"OpenAssistant dataset config for {_DATASETNAME}",
+            schema_name="[schema_name]",
+            subset_id=_DATASETNAME,
+        )
+    ]
+
+    DEFAULT_CONFIG_NAME = _DATASETNAME
+
+    def _info(self) -> datasets.DatasetInfo:
+        # TODO: Implement the schema for your dataset here.
+        raise NotImplementedError()
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
+
+        # If you need to access a config choice, that will be in self.config.name
+
+        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager
+
+        # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files.
+
+        urls = _URLS[_DATASETNAME]
+        data_dir = dl_manager.download_and_extract(urls)
+
+        # Not all datasets have predefined canonical train/val/test splits.
+        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # Whatever you put in gen_kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "train.jsonl"),
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "test.jsonl"),
+                    "split": "test",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, "dev.jsonl"),
+                    "split": "dev",
+                },
+            ),
+        ]
+
+    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+
+    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
+
+    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
+
+        # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
+
+        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files
+
+        if self.config.schema == "[schema_name]":
+            # TODO: yield (key, example) tuples in the given schema
+            for key, example in thing:
+                yield key, example
+
+# This allows you to run your dataloader with `python [dataset_name].py` during development
+# (it passes this script's own path to `datasets.load_dataset`).
+# TODO: Remove this before making your PR
+if __name__ == "__main__":
+    datasets.load_dataset(__file__)
@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+from dataclasses import dataclass
+
+import datasets
+
+
+@dataclass
+class OpenAssistantConfig(datasets.BuilderConfig):
+    """BuilderConfig for OpenAssistant datasets.
+
+    Declares `schema` and `subset_id` alongside `name`, `version` and
+    `description`. Every field defaults to None, so all of them are
+    optional keyword arguments when constructing a config.
+    """
+
+    # Name of the config.
+    name: str = None
+    # Version of the dataset this config belongs to.
+    version: datasets.Version = None
+    # Human-readable description of the config.
+    description: str = None
+    # Schema identifier; the dataset template compares it via `self.config.schema`.
+    schema: str = None
+    # Identifier of the subset; presumably the canonical dataset name (e.g. "the_pile") — confirm with callers.
+    subset_id: str = None
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Language Modeling Schema
+"""
+import datasets
+
+# Features of the language modeling schema: "text" is a single string value
+# and "meta" is a list of string values.
+features = datasets.Features(
+    {
+        "text": datasets.Value("string"),
+        "meta": [datasets.Value("string")],
+    }
+)