mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
Fix style
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
exclude: "build|stubs|^bot/templates/"
|
||||
exclude: "build|stubs|^bot/templates/|openassistant/templates"
|
||||
|
||||
default_language_version:
|
||||
python: python3
|
||||
|
||||
@@ -11,11 +11,13 @@ and then running:
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
|
||||
dataset = load_dataset("OpenAssistant/<dataset-name>")
|
||||
dataset = load_dataset("OpenAssistant/{dataset-name}")
|
||||
```
|
||||
|
||||
See the instructions below if you'd like to contribute a new dataset to the project.
|
||||
|
||||
## Adding a new dataset
|
||||
|
||||
## Uploading a dataset to the Hugging Face Hub
|
||||
|
||||
Adding a new dataset for the OpenAssistant project typically involves the following steps:
|
||||
@@ -29,8 +31,8 @@ Adding a new dataset for the OpenAssistant project typically involves the follow
|
||||
|
||||
To upload a dataset to the OpenAssistant organization, you first need to:
|
||||
|
||||
* Create a [Hugging Face account](https://huggingface.co/join) (it's free)
|
||||
* Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side
|
||||
- Create a [Hugging Face account](https://huggingface.co/join) (it's free)
|
||||
- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side
|
||||
|
||||
By default, your [role](https://huggingface.co/docs/hub/organizations-security#access-control-in-organizations) in the organization is `contributor`, which gives you write access to any datasets that you create (and only those). If you'd like to make changes to other datasets, [open a discussion or Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions).
|
||||
|
||||
@@ -76,8 +78,8 @@ python -m pip install datasets
|
||||
|
||||
We've created a [Gradio application](https://huggingface.co/spaces/OpenAssistant/dataset-generator) on Hugging Face Spaces that will create a new dataset repository for you with the following template files:
|
||||
|
||||
* A dataset loading script
|
||||
* A dataset card
|
||||
- A dataset loading script
|
||||
- A dataset card
|
||||
|
||||
Simply provide the name of the new dataset and your access token from Step 1, and you're good to go!
|
||||
|
||||
@@ -97,5 +99,4 @@ from datasets import load_dataset
|
||||
load_dataset("OpenAssistant/my_dataset")
|
||||
```
|
||||
|
||||
Congratulations - you've now added a dataset to the Hub!
|
||||
|
||||
Congratulations - you've now added a dataset to the OpenAssistant org!
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
# Dataset instructions for {dataset_name}
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
python -m pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Download
|
||||
|
||||
```bash
|
||||
python download.py
|
||||
```
|
||||
|
||||
## Preprocess
|
||||
|
||||
```bash
|
||||
python preprocess.py
|
||||
```
|
||||
|
||||
## Load
|
||||
|
||||
```bash
|
||||
python load.py
|
||||
```
|
||||
|
||||
## Upload
|
||||
|
||||
```bash
|
||||
python upload.py
|
||||
```
|
||||
@@ -0,0 +1,2 @@
|
||||
datasets>=2.8
|
||||
typer
|
||||
@@ -0,0 +1,206 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
This template serves as a starting point for contributing a dataset to the OpenAssistant repo.
|
||||
|
||||
When modifying it for your dataset, look for TODO items that offer specific instructions.
|
||||
|
||||
To create a dataset loading script you will create a class and implement 3 methods:
|
||||
* `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
|
||||
* `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
|
||||
* `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
|
||||
|
||||
Full documentation on writing dataset loading scripts can be found here:
|
||||
https://huggingface.co/docs/datasets/dataset_script
|
||||
|
||||
This template is adapted from the one provided by BigScience's BigBIO library:
|
||||
https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py
|
||||
|
||||
TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import datasets
|
||||
|
||||
from openassistant.utils.configs import OpenAssistantConfig
|
||||
|
||||
# TODO: import the schema that fits your dataset:
|
||||
from openassistant.utils.schemas import
|
||||
|
||||
# TODO: Add BibTeX citation where appropriate
|
||||
_CITATION = """\
|
||||
@article{,
|
||||
author = {},
|
||||
title = {},
|
||||
journal = {},
|
||||
volume = {},
|
||||
year = {},
|
||||
url = {},
|
||||
doi = {},
|
||||
biburl = {},
|
||||
bibsource = {}
|
||||
}
|
||||
"""
|
||||
|
||||
# TODO: create a module level variable with your dataset name (should match the script name)
|
||||
# E.g. The Pile: [dataset_name] --> the_pile
|
||||
_DATASETNAME = "[dataset_name]"
|
||||
# TODO: create a pretty display name for your dataset
|
||||
_DISPLAYNAME = "Dataset Name"
|
||||
|
||||
# TODO: Add a description of the dataset here
|
||||
# You can copy an official description
|
||||
_DESCRIPTION = """\
|
||||
This dataset is designed for XXX NLP task.
|
||||
"""
|
||||
|
||||
# TODO: Add a link to an official homepage for the dataset here (if possible)
|
||||
_HOMEPAGE = ""
|
||||
|
||||
# TODO: Add the license for the dataset here (if possible)
|
||||
# Note that this doesn't have to be a common open source license.
|
||||
# Some datasets have custom licenses. In this case, simply put the full license terms
|
||||
# into `_LICENSE`
|
||||
_LICENSE = ""
|
||||
|
||||
# TODO: Add links to the URLs needed to download your dataset files.
|
||||
# This variable can be a relative path for datasets whose files need to be
|
||||
# manually downloaded or preprocessed in advance.
|
||||
|
||||
# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
|
||||
# However, if you need to access different files for each config you can have multiple entries in this dict.
|
||||
# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
|
||||
_URLS = {
|
||||
_DATASETNAME: "url or list of urls or relative path like ./data ",
|
||||
}
|
||||
|
||||
# TODO: Add the tasks supported by the dataset. One dataset may support multiple tasks.
|
||||
_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
|
||||
|
||||
# TODO: Set this to a version that is associated with the dataset. If none exists, use "1.0.0".
|
||||
# This version doesn't have to be consistent with semantic versioning. Anything that is
|
||||
# provided by the original dataset as a version goes.
|
||||
_VERSION = ""
|
||||
|
||||
|
||||
# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
|
||||
# Append "Dataset" to the class name: ThePile --> ThePileDataset
|
||||
class NewDataset(datasets.GeneratorBasedBuilder):
    """TODO: Short description of my dataset."""

    VERSION = datasets.Version(_VERSION)

    # You will be able to load each dataset with
    # dataset = datasets.load_dataset('my_dataset')

    # TODO: For each dataset, implement a config for each subset;
    # If a dataset contains more than one subset, implement a config for EACH of them.
    # Each of them should contain:
    #   - name: should be unique for each dataset config eg. the_pile_[schema_name]
    #   - version: VERSION
    #   - description: one line description for the dataset
    #   - schema: open_assistant_[schema_name]
    #   - subset_id: subset id is the canonical name for the dataset (eg. the_pile)
    # where [schema_name] = (language_modeling)

    BUILDER_CONFIGS = [
        OpenAssistantConfig(
            name=f"{_DATASETNAME}_[schema_name]",
            version=VERSION,
            description=f"OpenAssistant dataset config for {_DATASETNAME}",
            # The keyword must be `schema`: that is the field declared on
            # OpenAssistantConfig and the attribute that `_generate_examples`
            # reads back through `self.config.schema`.
            schema="[schema_name]",
            subset_id=_DATASETNAME,
        )
    ]

    # Must be the `name` of one of the entries in BUILDER_CONFIGS, otherwise
    # the default does not select any declared config.
    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_[schema_name]"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata (description, features, homepage, license, citation).

        Raises:
            NotImplementedError: always, until the template placeholder below
                is removed by the dataset author.
        """
        # TODO: Implement the schema for your dataset here.
        # Template placeholder: define `features` (e.g. from the imported
        # schema) and delete this `raise` so the return below becomes reachable.
        raise NotImplementedError()

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        # TODO: This method is tasked with downloading/extracting the data and
        # defining the splits depending on the configuration

        # If you need to access a config choice, that will be in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs; many examples use the download_and_extract
        # method; see the DownloadManager docs here:
        # https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager

        # dl_manager can accept any type of nested list/dict and will give back
        # the same structure with the url replaced with the path to local files.

        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        # Not all datasets have predefined canonical train/val/test splits.
        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`

    # TODO: change the args of this function to match the keys in `gen_kwargs`.
    # You may add any necessary kwargs.

    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""
        # TODO: This method handles input defined in _split_generators to yield
        # (key, example) tuples from the dataset.

        # The `key` is for legacy reasons (tfds) and is not important in itself,
        # but must be unique for each example.

        # NOTE: For local datasets you will have access to self.config.data_dir
        # and self.config.data_files

        if self.config.schema == "[schema_name]":
            # TODO: yield (key, example) tuples in the given schema
            # `thing` is a template placeholder and is not defined anywhere;
            # replace it with your own iteration over the data at `filepath`.
            for key, example in thing:
                yield key, example
|
||||
|
||||
# This allows you to run your dataloader with `python [dataset_name].py` during development
|
||||
# TODO: Remove this before making your PR
|
||||
if __name__ == "__main__":
    # Development convenience only: hands this script's own path to
    # `datasets.load_dataset` when the file is executed directly.
    datasets.load_dataset(__file__)
|
||||
@@ -0,0 +1,15 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from dataclasses import dataclass
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
@dataclass
class OpenAssistantConfig(datasets.BuilderConfig):
    """Builder configuration used by OpenAssistant dataset loading scripts.

    Subclasses ``datasets.BuilderConfig`` and declares the five fields below,
    each defaulting to ``None``.
    """

    # Name of this configuration.
    name: str = None
    # Version associated with the dataset.
    version: datasets.Version = None
    # Description of the configuration.
    description: str = None
    # Name of the schema the examples are generated in.
    schema: str = None
    # Identifier of the dataset subset this configuration covers.
    subset_id: str = None
|
||||
@@ -0,0 +1,12 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Language Modeling Schema
|
||||
"""
|
||||
import datasets
|
||||
|
||||
# Feature specification for the language modeling schema: a string-valued
# `text` field and a `meta` field declared as a list holding a string value.
features = datasets.Features(
    {
        "text": datasets.Value("string"),
        "meta": [datasets.Value("string")],
    }
)
|
||||
Reference in New Issue
Block a user