From 0e024e39555c3f3e1d51a1eb24f521ca8c87c339 Mon Sep 17 00:00:00 2001 From: theblackcat102 Date: Sun, 29 Jan 2023 03:17:11 +0000 Subject: [PATCH] [fix] Add working A100 config for deberta-xxlarge (deepspeed stuck during evaluation, deadlock?) --- .../configs/deberta-v2-xxlarge-a100.yaml | 17 +++++++++++++++++ model/reward/instructor/utils.py | 9 +-------- 2 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml diff --git a/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml b/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml new file mode 100644 index 00000000..e9ec60c5 --- /dev/null +++ b/model/reward/instructor/configs/deberta-v2-xxlarge-a100.yaml @@ -0,0 +1,17 @@ +model_name: microsoft/deberta-v2-xxlarge +learning_rate: 2e-6 +scheduler: cosine +gradient_checkpointing: false +gradient_accumulation_steps: 12 +per_device_train_batch_size: 2 +per_device_eval_batch_size: 4 +warmup_steps: 600 +eval_steps: 1000000 +save_steps: 1000 +max_length: 400 +num_train_epochs: 3 +datasets: + - webgpt + - hfsummary + - anthropic_rlhf + - gptsynthetic diff --git a/model/reward/instructor/utils.py b/model/reward/instructor/utils.py index f3a337f6..94a256c2 100644 --- a/model/reward/instructor/utils.py +++ b/model/reward/instructor/utils.py @@ -103,6 +103,7 @@ def argument_parsing(parser): "gradient_accumulation_steps", "num_train_epochs", "save_steps", + "eval_steps", "per_device_train_batch_size", "per_device_eval_batch_size", ]: @@ -142,11 +143,3 @@ def get_datasets(dataset_list: List[AnyStr], tokenizer): evals["anthropic_rlhf"] = eval train = ConcatDataset(train_datasets) return train, evals - - -if __name__ == "__main__": - from transformers import AutoModelForSequenceClassification - - model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloomz-560m") - freeze_top_n_layers(model, 10) - print(model.state_dict().keys())