From 5b0f6c156737cd291db55ae9144366cbaf604a4e Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Wed, 4 Jan 2023 15:35:19 +1100 Subject: [PATCH] pre-commit fix --- .pre-commit-config.yaml | 2 +- docs/datasets.md | 162 +++++++++++++----- openassistant/templates/README.md | 5 +- .../{utils/configs.py => templates/hub.py} | 9 +- openassistant/templates/template.py | 7 +- openassistant/utils/__init__.py | 0 openassistant/utils/schemas/__init__.py | 0 .../utils/schemas/language_modeling.py | 12 -- 8 files changed, 131 insertions(+), 66 deletions(-) rename openassistant/{utils/configs.py => templates/hub.py} (69%) delete mode 100644 openassistant/utils/__init__.py delete mode 100644 openassistant/utils/schemas/__init__.py delete mode 100644 openassistant/utils/schemas/language_modeling.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7345327..7798b093 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: "build|stubs|^bot/templates/|openassistant/templates/$" +exclude: "build|stubs|^bot/$|templates/|openassistant/templates" repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/docs/datasets.md b/docs/datasets.md index c5400807..f905d74a 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -1,6 +1,9 @@ # Datasets -The datasets for this project are currently hosted as loading scripts on the [Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging Face Hub. Each of them can be loaded by first installing the 🤗 Datasets library: +The datasets for this project are currently hosted as loading scripts on the +[Open-Assistant organization](https://huggingface.co/OpenAssistant) the Hugging +Face Hub. 
Each of them can be loaded by first installing the 🤗 Datasets +library: ```bash python -m pip install datasets @@ -14,19 +17,31 @@ from datasets import load_dataset dataset = load_dataset("OpenAssistant/{dataset-name}") ``` -We use this GitHub repository to accept new submissions and standardize quality control. See the instructions below if you'd like to contribute a new dataset to the project. +We use this GitHub repository to accept new submissions and standardize quality +control. See the instructions below if you'd like to contribute a new dataset to +the project. ## Adding a new dataset ### 0. Pre-Requisites -Install Git and create a GitHub account prior to implementing a dataset; you can follow instructions to install Git [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). +Install Git and create a GitHub account prior to implementing a dataset; you can +follow instructions to install Git +[here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). -You will also need at least Python 3.8+. If you are installing Python, we recommend downloading [Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a python environment with necessary packages. **We strongly recommend Python 3.8+ for stability**. +You will also need at least Python 3.8+. If you are installing Python, we +recommend downloading +[Anaconda](https://docs.anaconda.com/anaconda/install/index.html) to curate a +python environment with necessary packages. **We strongly recommend Python 3.8+ +for stability**. ### 1. **Fork the OpenAssistant repository** -Fork the `OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). To do this, click the link to the repository and click "Fork" in the upper-right corner. You should get an option to fork to your account, provided you are signed into Github. +Fork the +`OpenAssistant`[repository](https://github.com/LAION-AI/Open-Assistant). 
To do +this, click the link to the repository and click "Fork" in the upper-right +corner. You should get an option to fork to your account, provided you are +signed into Github. After you fork, clone the repository locally. You can do so as follows: @@ -35,13 +50,15 @@ git clone git@github.com:/OpenAssistant.git cd OpenAssistant # enter the directory ``` -Next, you want to set your `upstream` location to enable you to push/pull (add or receive updates). You can do so as follows: +Next, you want to set your `upstream` location to enable you to push/pull (add +or receive updates). You can do so as follows: ```bash git remote add upstream git@github.com:LAION-AI/Open-Assistant.git ``` -You can optionally check that this was set properly by running the following command: +You can optionally check that this was set properly by running the following +command: ```bash git remote -v @@ -62,14 +79,18 @@ If you do NOT have an `origin` for whatever reason, then run: git remote add origin git@github.com:/OpenAssistant.git ``` -The goal of `upstream` is to keep your repository up-to-date to any changes that are made officially to the OpenAssistant repo. You can do this as follows by running the following commands: +The goal of `upstream` is to keep your repository up-to-date to any changes that +are made officially to the OpenAssistant repo. You can do this as follows by +running the following commands: ``` git fetch upstream git pull ``` -Provided you have no _merge conflicts_, this will ensure the repo stays up-to-date as you make changes. However, before you make changes, you should make a custom branch to implement your changes. +Provided you have no _merge conflicts_, this will ensure the repo stays +up-to-date as you make changes. However, before you make changes, you should +make a custom branch to implement your changes. You can make a new branch as such: @@ -89,14 +110,17 @@ The correct branch will have a asterisk \* in front of it. ### 2. 
**Create a development environment** -You can make an environment in any way you choose to. We highlight two possible options: +You can make an environment in any way you choose to. We highlight two possible +options: #### 2a) Create a conda environment The following instructions will create an Anaconda `openassistant` environment. -- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your appropriate operating system. -- Run the following command while in the `biomedical` folder (you can pick your python version): +- Install [anaconda](https://docs.anaconda.com/anaconda/install/) for your + appropriate operating system. +- Run the following command while in the `OpenAssistant` folder (you can pick your + python version): ```bash conda create -n openassistant python=3.8 # Creates a conda env @@ -105,11 +129,13 @@ cd openassistant pip install -r dev-requirements.txt # Install this while in the openassistant folder ``` -You can deactivate your environment at any time by either exiting your terminal or using `conda deactivate`. +You can deactivate your environment at any time by either exiting your terminal +or using `conda deactivate`. #### 2b) Create a venv environment -Python 3.3+ has venv automatically installed; official information is found [here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). +Python 3.3+ has venv automatically installed; official information is found +[here](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/). ``` python3 -m venv @@ -128,7 +154,8 @@ Make a new directory within the `openassistant/datasets` directory: mkdir openassistant/datasets/ ``` -**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when choosing a ``. +**NOTE**: Please use snake_case, i.e. lowercase letters and underscores when +choosing a ``.
Add an `__init__.py` file to this directory: @@ -136,15 +163,21 @@ Add an `__init__.py` file to this directory: touch openassistant/datasets//__init__.py ``` -Next, copy the `template.py` script of `templates` into your dataset folder. This script has "TODOs" to fill in for your dataloading script: +Next, copy the `template.py` script and `hub.py` module of `templates` into your +dataset folder. The `template.py` script has "TODOs" to fill in for your +dataloading script: ```bash +cp templates/hub.py openassistant/datasets// cp templates/template.py openassistant/datasets//.py ``` #### (Optional) Prepare local dataset files -If your dataset files aren't publicly available via URLs (e.g. because you implemented a web scraper), you'll need to implement some extra logic to store and prepare the data locally prior to implementing a loading script in 🤗 Datasets. +If your dataset files aren't publicly available via URLs (e.g. because you +implemented a web scraper), you'll need to implement some extra logic to store +and prepare the data locally prior to implementing a loading script in 🤗 +Datasets. To do so, first copy the template script for dataset creation: @@ -152,53 +185,67 @@ To do so, first copy the template script for dataset creation: cp templates/prepare.py openassistant/datasets// ``` -Next, implement any logic that is needed to prepare a local version of the dataset files (by convention we store them in `datasets//data/`). Add any extra dependencies to a `requirements.txt` file and provide instructions on how to prepare the dataset files in a README: +Next, implement any logic that is needed to prepare a local version of the +dataset files (by convention we store them in `datasets//data/`). 
+Add any extra dependencies to a `requirements.txt` file and provide instructions +on how to prepare the dataset files in a README: ```bash touch openassistant/datasets//requirements.txt cp templates/README.py openassistant/datasets// ``` -**Note:** Do not commit any dataset files to the OpenAssistant repo - all data will be hosted on the Hugging Face Hub. This step is needed for the data admins to be able to replicate the dataset creation process before pushing to the Hub. +**Note:** Do not commit any dataset files to the OpenAssistant repo - all data +will be hosted on the Hugging Face Hub. This step is needed for the project's +data admins to be able to replicate the dataset creation process before pushing +to the Hub. ### 4. Implement your dataset -To implement your dataloader, you will need to follow `template.py` and fill in all necessary TODOs. There are three key methods that are important: +To implement your dataloader, you will need to follow `template.py` and fill in +all necessary TODOs. There are three key methods that are important: - `_info`: Specifies the schema of the expected dataloader -- `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. -- `_generate_examples`: Create examples from data that conform to each schema defined in `_info`. +- `_split_generators`: Downloads and extracts data for each split (e.g. + train/val/test) or associate local data with each split. +- `_generate_examples`: Create examples from data that conform to each schema + defined in `_info`. For the `_info_` function, you will need to define `features` for your -`DatasetInfo` object. For each dataset config, choose the right schema from our list of examples. You can find the schemas in the [schemas directory](openassistant/utils/schemas/). +`DatasetInfo` object. For each dataset config, choose the right schema from our +list of examples. 
You can find the schemas in the +[`hub.py` module](openassistant/templates/hub.py). You will use this schema in the `_generate_examples` return value. -Populate the information in the dataset according to this schema; some fields may be empty. +Populate the information in the dataset according to this schema; some fields +may be empty. #### Example scripts TODO -#### Running & Debugging +#### Running & debugging -You can run your data loader script during development by appending the following -statement to your code ([templates/template.py](templates/template.py) already includes this): +You can run your data loader script during development by appending the +following statement to your code ([templates/template.py](templates/template.py) +already includes this): ```python if __name__ == "__main__": datasets.load_dataset(__file__) ``` -If you want to use an interactive debugger during development, you will have to use -`breakpoint()` instead of setting breakpoints directly in your IDE. Most IDEs will -recognize the `breakpoint()` statement and pause there during debugging. If your preferred -IDE doesn't support this, you can always run the script in your terminal and debug with -`pdb`. +If you want to use an interactive debugger during development, you will have to +use `breakpoint()` instead of setting breakpoints directly in your IDE. Most +IDEs will recognize the `breakpoint()` statement and pause there during +debugging. If your preferred IDE doesn't support this, you can always run the +script in your terminal and debug with `pdb`. ### 5.
Check if your dataloader works -Make sure your dataset is implemented correctly by checking in python the following commands: +Make sure your dataset is implemented correctly by checking in python the +following commands: ```python from datasets import load_dataset @@ -224,7 +271,8 @@ From the main directory, run the code quality checks via the following command: pre-commit run --all-files ``` -This runs the black formatter, isort, and lints to ensure that the code is readable and looks nice. Flake8 linting errors may require manual changes. +This runs the black formatter, isort, and lints to ensure that the code is +readable and looks nice. Flake8 linting errors may require manual changes. ### 8. Commit your changes @@ -235,7 +283,8 @@ git add openassistant/datasets//*.py git commit -m "A message describing your commits" ``` -Then, run the following commands to incorporate any new changes in the master branch of datasets as follows: +Then, run the following commands to incorporate any new changes in the master +branch of datasets as follows: ``` git fetch upstream @@ -252,13 +301,20 @@ git push -u origin ### 9. **Make a pull request** -Make a Pull Request to implement your changes on the main repository [here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New Pull Request". Then, choose your branch from your fork to push into "base:main". +Make a Pull Request to implement your changes on the main repository +[here](https://github.com/LAION-AI/Open-Assistant/pulls). To do so, click "New +Pull Request". Then, choose your branch from your fork to push into "base:main". -When opening a PR, please link the [issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your dataset using [closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) in the PR's description, e.g. `resolves #17`. 
+When opening a PR, please link the +[issue](https://github.com/LAION-AI/Open-Assistant/issues) corresponding to your +dataset using +[closing keywords](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) +in the PR's description, e.g. `resolves #17`. ## [Admins] Uploading a dataset to the Hugging Face Hub -Uploading a new dataset from `openassistant/datasets/` to the Hugging Face Hub typically involves the following steps: +Uploading a new dataset from `openassistant/datasets/` to the +Hugging Face Hub typically involves the following steps: 1. Setup 2. Create a new dataset repository @@ -270,9 +326,15 @@ Uploading a new dataset from `openassistant/datasets/` to the Hugg To upload a dataset to the OpenAssistant organization, you first need to: - Create a [Hugging Face account](https://huggingface.co/join) (it's free) -- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by clicking on the _Request to join this org_ button on the top right-hand side +- Join the [OpenAssistant organization](https://huggingface.co/OpenAssistant) by + clicking on the _Request to join this org_ button on the top right-hand side -Next, check that you're correctly logged in and that `git-lfs` is installed so that the dataset can be uploaded. To log in, create a **write access token** that can be found under your Hugging Face profile (icon in the top right corner on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens -> New Token. Alternatively, you can go to [your token settings](https://huggingface.co/settings/tokens) directly. +Next, check that you're correctly logged in and that `git-lfs` is installed so +that the dataset can be uploaded. To log in, create a **write access token** +that can be found under your Hugging Face profile (icon in the top right corner +on [hf.co](http://hf.co/), then Settings -> Access Tokens -> User Access Tokens +-> New Token. 
Alternatively, you can go to +[your token settings](https://huggingface.co/settings/tokens) directly. Once you've created a token, run: @@ -290,13 +352,17 @@ notebook_login() You can then copy-paste your token to log in locally. -Next, let's make sure that `git-lfs` is correctly installed. To do so, simply run: +Next, let's make sure that `git-lfs` is correctly installed. To do so, simply +run: ```bash git-lfs -v ``` -The output should show something like `git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that the `git-lfs` command was not found, please make sure to install it [here](https://git-lfs.github.com/) or simply via: +The output should show something like +`git-lfs/2.13.2 (GitHub; linux amd64; go 1.15.4)`. If your console states that +the `git-lfs` command was not found, please make sure to install it +[here](https://git-lfs.github.com/) or simply via: ```bash sudo apt-get install git-lfs @@ -312,7 +378,9 @@ python -m pip install datasets ### 2. Create a new dataset repository -Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for instructions on creating a new dataset repo on the Hub. Use the same snake_case name as the dataset in `openassistant/datasets/`. +Follow [this guide](https://huggingface.co/docs/datasets/upload_dataset) for +instructions on creating a new dataset repo on the Hub. Use the same snake_case +name as the dataset in `openassistant/datasets/`. Once you've created the dataset repo, clone it by running: @@ -332,11 +400,15 @@ cp openassistant/datasets//README.md . #### (Optional) Prepare local dataset files -If the dataset files of `openassistant/datasets/` aren't public, you'll need to run the `openassistant/datasets//prepare.py` script to create them. Store them in the same directory that is specified by the loading script (`data` by default). 
+If the dataset files of `openassistant/datasets/` aren't public, +you'll need to run the `openassistant/datasets//prepare.py` script +to create them. Store them in the same directory that is specified by the +loading script (`data` by default). ### 4. Upload to the Hub -Once the dataset script and card are ready, use Git to push them to the Hub (along with any data files you may need). +Once the dataset script and card are ready, use Git to push them to the Hub +(along with any data files you may need). At this point, you can load the dataset by running: diff --git a/openassistant/templates/README.md b/openassistant/templates/README.md index 3d4eb28f..b174c47e 100644 --- a/openassistant/templates/README.md +++ b/openassistant/templates/README.md @@ -6,6 +6,5 @@ Add any installation details here. ## Usage -Explain how to run any scripts that involve special downloading (e.g. data is obtained from a web scraper) or data preprocessing. - - +Explain how to run any scripts that involve preparing local dataset files, e.g. +if the dataset files aren't public or are produced by a web scraper. 
diff --git a/openassistant/utils/configs.py b/openassistant/templates/hub.py similarity index 69% rename from openassistant/utils/configs.py rename to openassistant/templates/hub.py index 2391f66b..49194e31 100644 --- a/openassistant/utils/configs.py +++ b/openassistant/templates/hub.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from dataclasses import dataclass import datasets @@ -13,3 +12,11 @@ class OpenAssistantConfig(datasets.BuilderConfig): description: str = None schema: str = None subset_id: str = None + + +lm_features = datasets.Features( + { + "text": datasets.Value("string"), + "meta": [datasets.Value("string")], + } +) diff --git a/openassistant/templates/template.py b/openassistant/templates/template.py index b6c52871..391df55f 100644 --- a/openassistant/templates/template.py +++ b/openassistant/templates/template.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # Copyright 2023 The OpenAssistant Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,10 +36,10 @@ from typing import Dict, List, Tuple import datasets -from openassistant.utils.configs import OpenAssistantConfig +from .hub import OpenAssistantConfig -# TODO: import the schema that fits your dataset: -from openassistant.utils.schemas import +# TODO: import the schema (i.e. 
features) that fits your dataset: +from .hub import # TODO: Add BibTeX citation where appropriate _CITATION = """\ diff --git a/openassistant/utils/__init__.py b/openassistant/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/openassistant/utils/schemas/__init__.py b/openassistant/utils/schemas/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/openassistant/utils/schemas/language_modeling.py b/openassistant/utils/schemas/language_modeling.py deleted file mode 100644 index 0b2072a5..00000000 --- a/openassistant/utils/schemas/language_modeling.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Language Modeling Schema -""" -import datasets - -features = datasets.Features( - { - "text": datasets.Value("string"), - "meta": [datasets.Value("string")], - } -)