From 610a1a2de42d50cfa8fb276070aa7132f5a64900 Mon Sep 17 00:00:00 2001 From: Lewis Tunstall Date: Fri, 10 Nov 2023 08:37:53 +0000 Subject: [PATCH] Add unit tests for data mixer --- .github/workflows/tests.yml | 31 +++++++++++++++ Makefile | 10 ++--- README.md | 4 +- setup.py | 2 + tests/test_data.py | 79 +++++++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/tests.yml create mode 100644 tests/test_data.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..990795f --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,31 @@ +name: Tests + +on: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +jobs: + + unit-tests: + name: Run unit tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Setup Python environment + uses: actions/setup-python@v2 + with: + python-version: 3.10.10 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[dev, torch]" + - name: Run unit tests + run: HF_TOKEN=$HF_TOKEN pytest -sv tests/ \ No newline at end of file diff --git a/Makefile b/Makefile index 2d82400..e2e4d2c 100644 --- a/Makefile +++ b/Makefile @@ -6,13 +6,13 @@ export PYTHONPATH = src check_dirs := src tests scripts style: - python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py - python -m isort $(check_dirs) setup.py + black --line-length 119 --target-version py310 $(check_dirs) setup.py + isort $(check_dirs) setup.py quality: - python -m black --check --line-length 119 --target-version py310 $(check_dirs) setup.py - python -m isort --check-only $(check_dirs) setup.py - python -m flake8 --max-line-length 119 $(check_dirs) setup.py + black --check --line-length 119 --target-version py310 $(check_dirs) setup.py + isort --check-only $(check_dirs) setup.py + 
flake8 --max-line-length 119 $(check_dirs) setup.py # Release stuff diff --git a/README.md b/README.md index 31ae641..952b7b3 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ To run the code in this project, first create a Python virtual environment using conda create -n handbook python=3.10 && conda activate handbook ``` -Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this hardware-dependent, we +Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/). You can then install the remaining package dependencies as follows: @@ -63,6 +63,8 @@ Finally, install Git LFS so that you can push models to the Hugging Face Hub: sudo apt-get install git-lfs ``` +You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁! + ## Project structure ``` diff --git a/setup.py b/setup.py index d71b591..1d2af6a 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ _deps = [ "safetensors>=0.3.3", "scipy", "tensorboard", + "torch==2.1.0", "transformers==4.35.0", "trl==0.7.4", "tqdm>=4.64.1", @@ -82,6 +83,7 @@ def deps_list(*pkgs): extras = {} extras["tests"] = deps_list("pytest", "parameterized") +extras["torch"] = deps_list("torch") extras["quality"] = deps_list("black", "isort", "flake8") extras["docs"] = deps_list("hf-doc-builder") extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"] diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..63e390a --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from alignment import DataArguments, get_datasets
+
+
+class GetDatasetsTest(unittest.TestCase):
+    """Checks `get_datasets` mixing; each test dataset here has 100 examples"""
+
+    def test_loading_data_args(self):
+        mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        # Wrap the mixer in DataArguments instead of passing the raw dict
+        loaded = get_datasets(DataArguments(dataset_mixer=mixer))
+        self.assertEqual(len(loaded["train"]), 100)
+        self.assertEqual(len(loaded["test"]), 300)
+
+    def test_loading_data_dict(self):
+        mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        loaded = get_datasets(mixer)
+        self.assertEqual(len(loaded["train"]), 100)
+        self.assertEqual(len(loaded["test"]), 300)
+
+    def test_loading_with_unit_fractions(self):
+        names = (
+            "HuggingFaceH4/testing_alpaca_small",
+            "HuggingFaceH4/testing_self_instruct_small",
+            "HuggingFaceH4/testing_codealpaca_small",
+        )
+        loaded = get_datasets(dict.fromkeys(names, 1.0))
+        self.assertEqual(len(loaded["train"]), 300)
+        self.assertEqual(len(loaded["test"]), 300)
+
+    def test_loading_with_fractions_greater_than_unity(self):
+        mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": 0.4,
+        }
+        loaded = get_datasets(mixer)
+        self.assertEqual(len(loaded["train"]), 70 + 40)
+        self.assertEqual(len(loaded["test"]), 200)
+
+    def test_loading_fails_with_negative_fractions(self):
+        mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": -0.3,
+        }
+        with pytest.raises(ValueError, match=r"Dataset fractions cannot be negative."):
+            get_datasets(mixer)
+
+    def test_loading_single_split_with_unit_fractions(self):
+        mixer = {"HuggingFaceH4/testing_alpaca_small": 1.0}
+        loaded = get_datasets(mixer, splits=["test"])
+        self.assertEqual(len(loaded["test"]), 100)
+        # Only "test" was requested, so looking up "train" must raise
+        with self.assertRaises(KeyError):
+            loaded["train"]