mirror of
https://github.com/wassname/alignment-handbook.git
synced 2026-06-27 16:14:07 +08:00
Add unit tests for data mixer
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
# CI workflow: installs the package with its dev + torch extras and runs the
# unit tests under tests/ on pushes to main / release branches and on pull
# requests that target main.
name: Tests

on:
  push:
    branches:
      - main
      - v*-release
  pull_request:
    branches:
      - main

jobs:

  unit-tests:
    name: Run unit tests
    env:
      # Job-level env: every step below (including pytest) inherits HF_TOKEN,
      # so it does not need to be re-exported on the command line.
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        # v2 of this action targets a Node runtime that hosted runners have retired.
        uses: actions/checkout@v4
      - name: Setup Python environment
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always read as a string, never as a number.
          python-version: "3.10.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install ".[dev, torch]"
      - name: Run unit tests
        run: pytest -sv tests/
|
||||
@@ -6,13 +6,13 @@ export PYTHONPATH = src
|
||||
check_dirs := src tests scripts
|
||||
|
||||
style:
|
||||
python -m black --line-length 119 --target-version py310 $(check_dirs) setup.py
|
||||
python -m isort $(check_dirs) setup.py
|
||||
black --line-length 119 --target-version py310 $(check_dirs) setup.py
|
||||
isort $(check_dirs) setup.py
|
||||
|
||||
quality:
|
||||
python -m black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
|
||||
python -m isort --check-only $(check_dirs) setup.py
|
||||
python -m flake8 --max-line-length 119 $(check_dirs) setup.py
|
||||
black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
|
||||
isort --check-only $(check_dirs) setup.py
|
||||
flake8 --max-line-length 119 $(check_dirs) setup.py
|
||||
|
||||
|
||||
# Release stuff
|
||||
|
||||
@@ -36,7 +36,7 @@ To run the code in this project, first create a Python virtual environment using
|
||||
conda create -n handbook python=3.10 && conda activate handbook
|
||||
```
|
||||
|
||||
Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this hardware-dependent, we
|
||||
Next, install PyTorch `v2.1.0` - the precise version is important for reproducibility! Since this is hardware-dependent, we
|
||||
direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
|
||||
|
||||
You can then install the remaining package dependencies as follows:
|
||||
@@ -63,6 +63,8 @@ Finally, install Git LFS so that you can push models to the Hugging Face Hub:
|
||||
sudo apt-get install git-lfs
|
||||
```
|
||||
|
||||
You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
|
||||
|
||||
## Project structure
|
||||
|
||||
```
|
||||
|
||||
@@ -62,6 +62,7 @@ _deps = [
|
||||
"safetensors>=0.3.3",
|
||||
"scipy",
|
||||
"tensorboard",
|
||||
"torch==2.1.0",
|
||||
"transformers==4.35.0",
|
||||
"trl==0.7.4",
|
||||
"tqdm>=4.64.1",
|
||||
@@ -82,6 +83,7 @@ def deps_list(*pkgs):
|
||||
|
||||
extras = {}
|
||||
extras["tests"] = deps_list("pytest", "parameterized")
|
||||
extras["torch"] = deps_list("torch")
|
||||
extras["quality"] = deps_list("black", "isort", "flake8")
|
||||
extras["docs"] = deps_list("hf-doc-builder")
|
||||
extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"]
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from alignment import DataArguments, get_datasets
|
||||
|
||||
|
||||
class GetDatasetsTest(unittest.TestCase):
    """Checks the split sizes returned by `get_datasets` for different dataset mixers.

    Each of these test datasets has 100 examples.
    """

    def test_loading_data_args(self):
        """A mixer passed through `DataArguments` yields the expected split sizes."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 0.5,
            "HuggingFaceH4/testing_self_instruct_small": 0.3,
            "HuggingFaceH4/testing_codealpaca_small": 0.2,
        }
        args = DataArguments(dataset_mixer=mixer)
        loaded = get_datasets(args)
        # Expected: 50 + 30 + 20 training examples and 300 test examples.
        self.assertEqual(len(loaded["train"]), 100)
        self.assertEqual(len(loaded["test"]), 300)

    def test_loading_data_dict(self):
        """A plain dict mixer yields the same split sizes as the `DataArguments` form."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 0.5,
            "HuggingFaceH4/testing_self_instruct_small": 0.3,
            "HuggingFaceH4/testing_codealpaca_small": 0.2,
        }
        loaded = get_datasets(mixer)
        self.assertEqual(len(loaded["train"]), 100)
        self.assertEqual(len(loaded["test"]), 300)

    def test_loading_with_unit_fractions(self):
        """Fractions of 1.0 for three datasets give 300 examples in both splits."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 1.0,
            "HuggingFaceH4/testing_self_instruct_small": 1.0,
            "HuggingFaceH4/testing_codealpaca_small": 1.0,
        }
        loaded = get_datasets(mixer)
        self.assertEqual(len(loaded["train"]), 300)
        self.assertEqual(len(loaded["test"]), 300)

    def test_loading_with_fractions_greater_than_unity(self):
        """Fractions that sum to more than 1.0 are accepted."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 0.7,
            "HuggingFaceH4/testing_self_instruct_small": 0.4,
        }
        loaded = get_datasets(mixer)
        # One term per dataset: 0.7 and 0.4 of 100 examples each.
        expected_train_size = 70 + 40
        self.assertEqual(len(loaded["train"]), expected_train_size)
        self.assertEqual(len(loaded["test"]), 200)

    def test_loading_fails_with_negative_fractions(self):
        """A negative fraction makes `get_datasets` raise `ValueError`."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 0.7,
            "HuggingFaceH4/testing_self_instruct_small": -0.3,
        }
        with pytest.raises(ValueError, match=r"Dataset fractions cannot be negative."):
            get_datasets(mixer)

    def test_loading_single_split_with_unit_fractions(self):
        """Requesting only the "test" split returns no "train" entry."""
        mixer = {
            "HuggingFaceH4/testing_alpaca_small": 1.0,
        }
        loaded = get_datasets(mixer, splits=["test"])
        self.assertEqual(len(loaded["test"]), 100)
        # Looking up the split that was not requested must raise KeyError.
        with self.assertRaises(KeyError):
            _ = loaded["train"]
|
||||
Reference in New Issue
Block a user