mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
206 lines
8.3 KiB
Python
206 lines
8.3 KiB
Python
# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
This template serves as a starting point for contributing a dataset to the OpenAssistant repo.
|
|
|
|
When modifying it for your dataset, look for TODO items that offer specific instructions.
|
|
|
|
To create a dataset loading script you will create a class and implement 3 methods:
|
|
* `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
|
|
* `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
|
|
* `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
|
|
|
|
Full documentation on writing dataset loading scripts can be found here:
|
|
https://huggingface.co/docs/datasets/dataset_script
|
|
|
|
This template is adapted from the one provided by BigScience's BigBIO library:
|
|
https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py
|
|
|
|
TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
|
|
"""
|
|
|
|
import os
|
|
from typing import Dict, List, Tuple
|
|
|
|
import datasets
|
|
|
|
from .hub import OpenAssistantConfig
|
|
|
|
# TODO: import the schema (i.e. features) that fits your dataset:
|
|
from .hub import
|
|
|
|
# TODO: Add BibTeX citation where appropriate
|
|
_CITATION = """\
|
|
@article{,
|
|
author = {},
|
|
title = {},
|
|
journal = {},
|
|
volume = {},
|
|
year = {},
|
|
url = {},
|
|
doi = {},
|
|
biburl = {},
|
|
bibsource = {}
|
|
}
|
|
"""
|
|
|
|
# TODO: create a module level variable with your dataset name (should match the script name)
|
|
# E.g. The Pile: [dataset_name] --> the_pile
|
|
_DATASETNAME = "[dataset_name]"
|
|
# TODO: create a pretty display name for your dataset
|
|
_DISPLAYNAME = "Dataset Name"
|
|
|
|
# TODO: Add a description of the dataset here
|
|
# You can copy an official description
|
|
_DESCRIPTION = """\
|
|
This dataset is designed for XXX NLP task.
|
|
"""
|
|
|
|
# TODO: Add a link to an official homepage for the dataset here (if possible)
|
|
_HOMEPAGE = ""
|
|
|
|
# TODO: Add the licence for the dataset here (if possible)
|
|
# Note that this doesn't have to be a common open source license.
|
|
# Some datasets have custom licenses. In this case, simply put the full license terms
|
|
# into `_LICENSE`
|
|
_LICENSE = ""
|
|
|
|
# TODO: Add links to the URLs needed to download your dataset files.
|
|
# This variable can be a relative path for datasets whose files need to be
|
|
# manually downloaded or preprocessed in advance.
|
|
|
|
# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators.
|
|
# However, if you need to access different files for each config you can have multiple entries in this dict.
|
|
# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method)
|
|
_URLS = {
|
|
_DATASETNAME: "url or list of urls or relative path like ./data ",
|
|
}
|
|
|
|
# TODO: add supported task by dataset. One dataset may support multiple tasks
|
|
_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]
|
|
|
|
# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
|
|
# This version doesn't have to be consistent with semantic versioning. Anything that is
|
|
# provided by the original dataset as a version goes.
|
|
_VERSION = ""
|
|
|
|
|
|
# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
|
|
# Append "Dataset" to the class name: ThePile --> ThePileDataset
|
|
class NewDataset(datasets.GeneratorBasedBuilder):
|
|
"""TODO: Short description of my dataset."""
|
|
|
|
VERSION = datasets.Version(_VERSION)
|
|
|
|
# You will be able to load each dataset with
|
|
# dataset = datasets.load_dataset('my_dataset')
|
|
|
|
# TODO: For each dataset, implement a config for each subset;
|
|
# If a dataset contains more than one subset, implement a config for EACH of them.
|
|
# Each of them should contain:
|
|
# - name: should be unique for each dataset config eg. the_pile_[schema_name]
|
|
# - version: VERSION
|
|
# - description: one line description for the dataset
|
|
# - schema: open_assistant_[schema_name]
|
|
# - subset_id: subset id is the canonical name for the dataset (eg. the_pile)
|
|
# where [schema_name] = (language_modeling)
|
|
|
|
BUILDER_CONFIGS = [
|
|
OpenAssistantConfig(
|
|
name=f"{_DATASETNAME}_[schema_name]",
|
|
version=VERSION,
|
|
description=f"OpenAssistant dataset config for {_DATASETNAME}",
|
|
schema_name="[schema_name]",
|
|
subset_id=_DATASETNAME,
|
|
)
|
|
]
|
|
|
|
DEFAULT_CONFIG_NAME = _DATASETNAME
|
|
|
|
def _info(self) -> datasets.DatasetInfo:
|
|
# TODO: Implement the schema for your dataset here.
|
|
raise NotImplementedError()
|
|
|
|
return datasets.DatasetInfo(
|
|
description=_DESCRIPTION,
|
|
features=features,
|
|
homepage=_HOMEPAGE,
|
|
license=_LICENSE,
|
|
citation=_CITATION,
|
|
)
|
|
|
|
def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
|
|
"""Returns SplitGenerators."""
|
|
# TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
|
|
|
|
# If you need to access a config choice, that will be in self.config.name
|
|
|
|
# dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager
|
|
|
|
# dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files.
|
|
|
|
urls = _URLS[_DATASETNAME]
|
|
data_dir = dl_manager.download_and_extract(urls)
|
|
|
|
# Not all datasets have predefined canonical train/val/test splits.
|
|
# If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
|
|
|
|
return [
|
|
datasets.SplitGenerator(
|
|
name=datasets.Split.TRAIN,
|
|
# Whatever you put in gen_kwargs will be passed to _generate_examples
|
|
gen_kwargs={
|
|
"filepath": os.path.join(data_dir, "train.jsonl"),
|
|
"split": "train",
|
|
},
|
|
),
|
|
datasets.SplitGenerator(
|
|
name=datasets.Split.TEST,
|
|
gen_kwargs={
|
|
"filepath": os.path.join(data_dir, "test.jsonl"),
|
|
"split": "test",
|
|
},
|
|
),
|
|
datasets.SplitGenerator(
|
|
name=datasets.Split.VALIDATION,
|
|
gen_kwargs={
|
|
"filepath": os.path.join(data_dir, "dev.jsonl"),
|
|
"split": "dev",
|
|
},
|
|
),
|
|
]
|
|
|
|
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
|
|
|
|
# TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
|
|
|
|
def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
|
|
"""Yields examples as (key, example) tuples."""
|
|
# TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
|
|
|
|
# The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
|
|
|
|
# NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files
|
|
|
|
if self.config.schema == "[schema_name]":
|
|
# TODO: yield (key, example) tuples in the given schema
|
|
for key, example in thing:
|
|
yield key, example
|
|
|
|
# This allows you to run your dataloader with `python [dataset_name].py` during development
|
|
# TODO: Remove this before making your PR
|
|
if __name__ == "__main__":
|
|
datasets.load_dataset(__file__)
|