From 610a1a2de42d50cfa8fb276070aa7132f5a64900 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall <lewis.c.tunstall@gmail.com>
Date: Fri, 10 Nov 2023 08:37:53 +0000
Subject: [PATCH] Add unit tests for data mixer

---
 .github/workflows/tests.yml | 31 +++++++++++++++
 Makefile                    | 10 ++---
 README.md                   |  4 +-
 setup.py                    |  2 +
 tests/test_data.py          | 79 +++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/tests.yml
 create mode 100644 tests/test_data.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..990795f
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,31 @@
+name: Tests
+
+on:
+  push:
+    branches:
+      - main
+      - v*-release
+  pull_request:
+    branches:
+      - main
+
+jobs:
+
+  unit-tests:
+    name: Run unit tests
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}  # job-level, so every step below inherits it
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Setup Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10.10"  # quoted so YAML can never read a version as a number
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install ".[dev, torch]"
+      - name: Run unit tests
+        run: pytest -sv tests/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 2d82400..e2e4d2c 100644
--- a/Makefile
+++ b/Makefile
@@ -6,13 +6,13 @@ export PYTHONPATH = src
 check_dirs := src tests scripts
 
 style:
-	python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
-	python -m isort $(check_dirs) setup.py
+	black --line-length 119 --target-version py310 $(check_dirs) setup.py
+	isort $(check_dirs) setup.py
 
 quality:
-	python -m black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
-	python -m isort --check-only $(check_dirs) setup.py
-	python -m flake8 --max-line-length 119 $(check_dirs) setup.py
+	black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
+	isort --check-only $(check_dirs) setup.py
+	flake8 --max-line-length 119 $(check_dirs) setup.py
 
 
 # Release stuff
diff --git a/README.md b/README.md
index 31ae641..952b7b3 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ To run the code in this project, first create a Python virtual environment using
 conda create -n handbook python=3.10 && conda activate handbook
 ```
 
-Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this hardware-dependent, we
+Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we
 direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
 
 You can then install the remaining package dependencies as follows:
@@ -63,6 +63,8 @@ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
 sudo apt-get install git-lfs
 ```
 
+You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
+
 ## Project structure
 
 ```
diff --git a/setup.py b/setup.py
index d71b591..1d2af6a 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@ _deps = [
     "safetensors>=0.3.3",
     "scipy",
     "tensorboard",
+    "torch==2.1.0",
     "transformers==4.35.0",
     "trl==0.7.4",
     "tqdm>=4.64.1",
@@ -82,6 +83,7 @@ def deps_list(*pkgs):
 
 extras = {}
 extras["tests"] = deps_list("pytest", "parameterized")
+extras["torch"] = deps_list("torch")
 extras["quality"] = deps_list("black", "isort", "flake8")
 extras["docs"] = deps_list("hf-doc-builder")
 extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"]
diff --git a/tests/test_data.py b/tests/test_data.py
new file mode 100644
index 0000000..63e390a
--- /dev/null
+++ b/tests/test_data.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from alignment import DataArguments, get_datasets
+
+
+class GetDatasetsTest(unittest.TestCase):
+    """Check the sizes of the mixes built by `get_datasets`; each test dataset has 100 examples per split."""
+
+    def test_loading_data_args(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        data_args = DataArguments(dataset_mixer=dataset_mixer)
+        datasets = get_datasets(data_args)
+        self.assertEqual(len(datasets["train"]), 100)  # 50 + 30 + 20 examples
+        self.assertEqual(len(datasets["test"]), 300)  # test splits are expected in full: 3 x 100
+
+    def test_loading_data_dict(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.5,
+            "HuggingFaceH4/testing_self_instruct_small": 0.3,
+            "HuggingFaceH4/testing_codealpaca_small": 0.2,
+        }
+        datasets = get_datasets(dataset_mixer)  # plain dict passed instead of DataArguments
+        self.assertEqual(len(datasets["train"]), 100)
+        self.assertEqual(len(datasets["test"]), 300)
+
+    def test_loading_with_unit_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 1.0,
+            "HuggingFaceH4/testing_self_instruct_small": 1.0,
+            "HuggingFaceH4/testing_codealpaca_small": 1.0,
+        }
+        datasets = get_datasets(dataset_mixer)
+        self.assertEqual(len(datasets["train"]), 300)  # 3 x 100, every train example expected
+        self.assertEqual(len(datasets["test"]), 300)
+
+    def test_loading_with_fractions_greater_than_unity(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": 0.4,
+        }
+        datasets = get_datasets(dataset_mixer)
+        self.assertEqual(len(datasets["train"]), 70 + 40)  # fractions summing to more than 1 are accepted
+        self.assertEqual(len(datasets["test"]), 200)  # 2 x 100
+
+    def test_loading_fails_with_negative_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 0.7,
+            "HuggingFaceH4/testing_self_instruct_small": -0.3,
+        }
+        with pytest.raises(ValueError, match=r"Dataset fractions cannot be negative."):
+            get_datasets(dataset_mixer)
+
+    def test_loading_single_split_with_unit_fractions(self):
+        dataset_mixer = {
+            "HuggingFaceH4/testing_alpaca_small": 1.0,
+        }
+        datasets = get_datasets(dataset_mixer, splits=["test"])
+        self.assertEqual(len(datasets["test"]), 100)
+        self.assertRaises(KeyError, lambda: datasets["train"])  # only the requested split is expected back