Files
Lewis Tunstall 5b0f6c1567 pre-commit fix
2023-01-04 15:35:19 +11:00

206 lines
8.3 KiB
Python

# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This template serves as a starting point for contributing a dataset to the OpenAssistant repo.
When modifying it for your dataset, look for TODO items that offer specific instructions.
To create a dataset loading script you will create a class and implement 3 methods:
* `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
* `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
* `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.
Full documentation on writing dataset loading scripts can be found here:
https://huggingface.co/docs/datasets/dataset_script
This template is adapted from the one provided by BigScience's BigBIO library:
https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py
TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
"""
import os
from typing import Dict, Iterator, List, Tuple

import datasets

from .hub import OpenAssistantConfig

# TODO: import the schema (i.e. features) that fits your dataset:
from .hub import
# TODO: Add the BibTeX citation for the dataset where appropriate.
# This string is passed unchanged as `citation=` to `datasets.DatasetInfo` in `_info`.
_CITATION = """\
@article{,
author = {},
title = {},
journal = {},
volume = {},
year = {},
url = {},
doi = {},
biburl = {},
bibsource = {}
}
"""

# TODO: Create a module-level variable with your dataset name (it should match the script name).
# E.g. The Pile: [dataset_name] --> the_pile
# This value is used as the key into `_URLS`, as the prefix of the builder config name,
# and as the config's `subset_id`.
_DATASETNAME = "[dataset_name]"

# TODO: Create a pretty display name for your dataset.
_DISPLAYNAME = "Dataset Name"

# TODO: Add a description of the dataset here.
# You can copy an official description.
_DESCRIPTION = """\
This dataset is designed for XXX NLP task.
"""

# TODO: Add a link to an official homepage for the dataset here (if possible).
_HOMEPAGE = ""

# TODO: Add the license for the dataset here (if possible).
# Note that this doesn't have to be a common open source license.
# Some datasets have custom licenses. In this case, simply put the full license terms
# into `_LICENSE`.
_LICENSE = ""

# TODO: Add links to the URLs needed to download your dataset files.
# This variable can be a relative path for datasets whose files need to be
# manually downloaded or preprocessed in advance.
# For publicly available datasets you will most likely end up passing these URLs to
# dl_manager in `_split_generators`.
# However, if you need to access different files for each config you can have multiple
# entries in this dict.
# This can be an arbitrarily nested dict/list of URLs (see below in the `_split_generators` method).
_URLS = {
    _DATASETNAME: "url or list of urls or relative path like ./data ",
}

# TODO: Add the tasks supported by the dataset. One dataset may support multiple tasks.
_SUPPORTED_TASKS = [] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

# TODO: Set this to a version that is associated with the dataset. If none exists use "1.0.0".
# This version doesn't have to be consistent with semantic versioning. Anything that is
# provided by the original dataset as a version goes.
# NOTE: this value is passed to `datasets.Version(...)` in the builder class below, so the
# empty placeholder must be replaced before the script is used.
_VERSION = ""
# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
# Append "Dataset" to the class name: ThePile --> ThePileDataset
class NewDataset(datasets.GeneratorBasedBuilder):
    """TODO: Short description of my dataset.

    Template builder: it declares a single ``OpenAssistantConfig`` and the three
    hooks (``_info``, ``_split_generators``, ``_generate_examples``) that a
    contributor fills in for a concrete dataset.
    """

    VERSION = datasets.Version(_VERSION)

    # You will be able to load each dataset with
    # dataset = datasets.load_dataset('my_dataset')

    # TODO: For each dataset, implement a config for each subset;
    # If a dataset contains more than one subset, implement a config for EACH of them.
    # Each of them should contain:
    #   - name: should be unique for each dataset config eg. the_pile_[schema_name]
    #   - version: VERSION
    #   - description: one line description for the dataset
    #   - schema: open_assistant_[schema_name]
    #   - subset_id: subset id is the canonical name for the dataset (eg. the_pile)
    # where [schema_name] = (language_modeling)
    BUILDER_CONFIGS = [
        OpenAssistantConfig(
            name=f"{_DATASETNAME}_[schema_name]",
            version=VERSION,
            description=f"OpenAssistant dataset config for {_DATASETNAME}",
            # Keyword matches the `schema` field listed above and the
            # `self.config.schema` lookup in `_generate_examples`.
            # NOTE(review): confirm against the OpenAssistantConfig definition in `.hub`.
            schema="[schema_name]",
            subset_id=_DATASETNAME,
        )
    ]

    # Must equal the `name` of one of the BUILDER_CONFIGS entries above; a bare
    # `_DATASETNAME` would not match any declared config.
    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_[schema_name]"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata built from the module-level constants.

        Raises:
            NotImplementedError: always, until the schema TODO below is resolved.
        """
        # TODO: Implement the schema for your dataset here.
        # Define `features` for your schema, then delete the `raise` so the
        # `return` below becomes reachable.
        raise NotImplementedError()

        # Intentionally unreachable in the template: `features` is not defined yet.
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Download/extract the data and return one SplitGenerator per split.

        Args:
            dl_manager: download manager; its `download_and_extract` is called
                with the `_URLS` entry for this dataset.

        Returns:
            SplitGenerators for the train, test and validation splits, each
            carrying the `filepath` and `split` kwargs for `_generate_examples`.
        """
        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
        # If you need to access a config choice, that will be in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager

        # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files.
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        # Not all datasets have predefined canonical train/val/test splits.
        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`

    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
    def _generate_examples(self, filepath, split: str) -> Iterator[Tuple[int, Dict]]:
        """Yield examples as (key, example) tuples.

        Args:
            filepath: path of the split file, as set in `gen_kwargs`.
            split: split label ("train", "test" or "dev"), as set in `gen_kwargs`.

        Yields:
            (key, example) pairs when the active config's schema is
            "[schema_name]"; nothing is yielded for any other schema.
        """
        # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.

        # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.

        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files

        if self.config.schema == "[schema_name]":
            # TODO: yield (key, example) tuples in the given schema
            # `thing` is a template placeholder and is not defined anywhere:
            # replace it with an iterable of (key, example) pairs for this split.
            for key, example in thing:
                yield key, example
# This allows you to run your dataloader with `python [dataset_name].py` during development:
# the script passes its own path (`__file__`) to `datasets.load_dataset`.
# NOTE(review): this module uses relative `.hub` imports, which may not resolve when the
# file is executed directly as a script — confirm before relying on this entry point.
# TODO: Remove this before making your PR
if __name__ == "__main__":
    datasets.load_dataset(__file__)