# Copyright 2023 The OpenAssistant Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This template serves as a starting point for contributing a dataset to the OpenAssistant repo.

When modifying it for your dataset, look for TODO items that offer specific instructions.

To create a dataset loading script you will create a class and implement 3 methods:
  * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object.
  * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associates local data with each split.
  * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`.

Full documentation on writing dataset loading scripts can be found here:
https://huggingface.co/docs/datasets/dataset_script

This template is adapted from the one provided by BigScience's BigBIO library:
https://github.com/bigscience-workshop/biomedical/blob/main/templates/template.py

TODO: Before submitting your script, delete this docstring and replace it with a description of your dataset.
"""

import os
from typing import Dict, List, Tuple

import datasets

from .hub import OpenAssistantConfig

# TODO: import the schema (i.e. features) that fits your dataset, e.g.
#   from .hub import <features_for_your_schema>
# The line is left commented out because a bare `from .hub import` is a
# syntax error that prevents this template from being parsed at all.

# TODO: Add BibTeX citation where appropriate
_CITATION = """\
@article{,
  author = {},
  title = {},
  journal = {},
  volume = {},
  year = {},
  url = {},
  doi = {},
  biburl = {},
  bibsource = {}
}
"""

# TODO: create a module level variable with your dataset name (should match the script name)
#  E.g. The Pile: [dataset_name] --> the_pile
_DATASETNAME = "[dataset_name]"

# TODO: create a pretty display name for your dataset
_DISPLAYNAME = "Dataset Name"

# TODO: Add a description of the dataset here
# You can copy an official description
_DESCRIPTION = """\
This dataset is designed for XXX NLP task.
"""

# TODO: Add a link to an official homepage for the dataset here (if possible)
_HOMEPAGE = ""

# TODO: Add the licence for the dataset here (if possible)
# Note that this doesn't have to be a common open source license.
# Some datasets have custom licenses. In this case, simply put the full license terms
# into `_LICENSE`
_LICENSE = ""

# TODO: Add links to the URLs needed to download your dataset files.
# This variable can be a relative path for datasets whose files need to be
# manually downloaded or preprocessed in advance.
# For publicly available datasets you will most likely end up passing these URLs
# to dl_manager in _split_generators.
# However, if you need to access different files for each config you can have
# multiple entries in this dict.
# This can be an arbitrarily nested dict/list of URLs (see below in the
# `_split_generators` method).
_URLS = {
    _DATASETNAME: "url or list of urls or relative path like ./data ",
}

# TODO: add supported task by dataset. One dataset may support multiple tasks
_SUPPORTED_TASKS = []  # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

# TODO: set this to a version that is associated with the dataset. If none exists use "1.0.0"
# This version doesn't have to be consistent with semantic versioning.
# Anything that is provided by the original dataset as a version goes, as long
# as `datasets.Version` can parse it (it expects an "x.y.z" string). The empty
# placeholder is replaced by "1.0.0", the fallback the TODO for this constant
# recommends, so that the class body below can be evaluated.
_VERSION = "1.0.0"


# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case
#  Append "Dataset" to the class name: ThePile --> ThePileDataset
class NewDataset(datasets.GeneratorBasedBuilder):
    """TODO: Short description of my dataset."""

    VERSION = datasets.Version(_VERSION)

    # You will be able to load each dataset with
    # dataset = datasets.load_dataset('my_dataset')

    # TODO: For each dataset, implement a config for each subset;
    # If a dataset contains more than one subset, implement a config for EACH of them.
    # Each of them should contain:
    #   - name: should be unique for each dataset config eg. the_pile_[schema_name]
    #   - version: VERSION
    #   - description: one line description for the dataset
    #   - schema: open_assistant_[schema_name]
    #   - subset_id: subset id is the canonical name for the dataset (eg. the_pile)
    #  where [schema_name] = (language_modeling)

    # NOTE(review): the config is built with `schema_name=` here but read back
    # as `self.config.schema` in `_generate_examples`, and the list above calls
    # the field `schema`. Confirm the field name against OpenAssistantConfig in
    # `.hub` and make the two agree.
    BUILDER_CONFIGS = [
        OpenAssistantConfig(
            name=f"{_DATASETNAME}_[schema_name]",
            version=VERSION,
            description=f"OpenAssistant dataset config for {_DATASETNAME}",
            schema_name="[schema_name]",
            subset_id=_DATASETNAME,
        )
    ]

    # Must equal the `name` of one of the configs in BUILDER_CONFIGS; the bare
    # dataset name did not match the only config defined above.
    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_[schema_name]"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and feature schema.

        Returns:
            A `datasets.DatasetInfo` built from the module-level description,
            homepage, license and citation constants plus the dataset features.

        Raises:
            NotImplementedError: until `features` is defined for the dataset.
        """
        # TODO: Implement the schema for your dataset here, i.e. assign the
        # features that fit your dataset to `features`.
        features = None

        # Placeholder guard: keeps the template failing loudly until the schema
        # is filled in, while leaving the return below reachable afterwards.
        if features is None:
            raise NotImplementedError("TODO: define `features` in `_info` for your dataset.")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators.

        Downloads and extracts `_URLS[_DATASETNAME]` through `dl_manager` and
        declares a train, test and validation split, each pointing at a JSONL
        file inside the extracted directory.

        Args:
            dl_manager: download manager used to fetch and extract the URLs.

        Returns:
            One `datasets.SplitGenerator` per split; its `gen_kwargs` are
            passed to `_generate_examples`.
        """
        # TODO: This method is tasked with downloading/extracting the data and
        # defining the splits depending on the configuration.
        # If you need to access a config choice, that will be in self.config.name

        # dl_manager is a datasets.download.DownloadManager that can be used to
        # download and extract URLs; many examples use the download_and_extract
        # method; see the DownloadManager docs here:
        # https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager

        # dl_manager can accept any type of nested list/dict and will give back
        # the same structure with the url replaced with the path to local files.
        urls = _URLS[_DATASETNAME]
        data_dir = dl_manager.download_and_extract(urls)

        # Not all datasets have predefined canonical train/val/test splits.
        # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # Whatever you put in gen_kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "test.jsonl"),
                    "split": "test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "dev.jsonl"),
                    "split": "dev",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`

    # TODO: change the args of this function to match the keys in `gen_kwargs`.
    # You may add any necessary kwargs.

    # NOTE: the return annotation describes each yielded item; the method
    # itself is a generator.
    def _generate_examples(self, filepath, split: str) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples.

        Args:
            filepath: path of the data file for this split, as set in
                `_split_generators`.
            split: split label ("train", "test" or "dev") from `gen_kwargs`.

        Yields:
            `(key, example)` pairs; `key` must be unique for each example.

        Raises:
            NotImplementedError: until the example source is implemented.
        """
        # TODO: This method handles input defined in _split_generators to yield
        # (key, example) tuples from the dataset.
        # The `key` is for legacy reasons (tfds) and is not important in itself,
        # but must be unique for each example.

        # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files

        if self.config.schema == "[schema_name]":
            # TODO: yield (key, example) tuples in the given schema, e.g. by
            # assigning an iterable of such pairs read from `filepath`.
            examples = None

            # Placeholder guard replacing a reference to an undefined name.
            if examples is None:
                raise NotImplementedError("TODO: produce (key, example) pairs in `_generate_examples`.")

            for key, example in examples:
                yield key, example


# This allows you to run your dataloader with `python [dataset_name].py` during development
# TODO: Remove this before making your PR
if __name__ == "__main__":
    datasets.load_dataset(__file__)