[fix] Add working A100 config for deberta-xxlarge (DeepSpeed gets stuck during evaluation; possible deadlock?)

This commit is contained in:
theblackcat102
2023-01-29 03:17:11 +00:00
parent def03d75d2
commit 0e024e3955
2 changed files with 18 additions and 8 deletions
@@ -0,0 +1,17 @@
# Training configuration for microsoft/deberta-v2-xxlarge.
# NOTE(review): the commit title describes this as a working A100 setup;
# the values below were presumably tuned for that GPU — confirm before reuse.
model_name: microsoft/deberta-v2-xxlarge
# NOTE(review): YAML 1.1 loaders such as PyYAML read `2e-6` (no decimal
# point) as a string, not a float — confirm the config loader casts it.
learning_rate: 2e-6
scheduler: cosine
gradient_checkpointing: false
# With per_device_train_batch_size 2 this gives 2 x 12 = 24 samples per
# device per optimizer step (assuming standard accumulation semantics).
gradient_accumulation_steps: 12
per_device_train_batch_size: 2
per_device_eval_batch_size: 4
warmup_steps: 600
# NOTE(review): far larger than save_steps; presumably chosen so that
# evaluation effectively never triggers, since the commit title reports
# DeepSpeed hanging during evaluation — confirm and revisit once fixed.
eval_steps: 1000000
save_steps: 1000
max_length: 400
num_train_epochs: 3
# Dataset identifiers; presumably resolved by get_datasets() — confirm.
datasets:
- webgpt
- hfsummary
- anthropic_rlhf
- gptsynthetic
+1 -8
View File
@@ -103,6 +103,7 @@ def argument_parsing(parser):
"gradient_accumulation_steps",
"num_train_epochs",
"save_steps",
"eval_steps",
"per_device_train_batch_size",
"per_device_eval_batch_size",
]:
@@ -142,11 +143,3 @@ def get_datasets(dataset_list: List[AnyStr], tokenizer):
evals["anthropic_rlhf"] = eval
train = ConcatDataset(train_datasets)
return train, evals
if __name__ == "__main__":
    # Manual check, run only when this file is executed as a script:
    # load the bigscience/bloomz-560m checkpoint, call freeze_top_n_layers
    # on it, and print the names of the entries in the model's state dict.
    # The import is kept local to this guard, as in the original.
    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloomz-560m")
    # freeze_top_n_layers is not defined in this block; it is expected to be
    # available in module scope.
    freeze_top_n_layers(model, 10)
    print(model.state_dict().keys())