# Mirror of https://github.com/wassname/Open-Assistant.git
# Synced 2026-07-03 17:10:10 +08:00
# 80 lines, 1.7 KiB, YAML
# Baseline training settings. The named sections further down list only a
# handful of keys each; presumably the loader overlays the selected section on
# top of these values -- confirm against the config-loading code.
defaults:
  # Written with a mantissa dot so that YAML 1.1 loaders (e.g. PyYAML) resolve
  # it as a float; the bare form `1e-5` is loaded as a string there.
  learning_rate: 1.0e-5
  gradient_checkpointing: false
  gradient_accumulation_steps: 32
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  weight_decay: 0.0
  warmup_steps: 600
  eval_steps: 500
  save_steps: 500
  max_length: 512
  num_train_epochs: 3
  logging_steps: 10
  max_grad_norm: 2.0
  save_total_limit: 4
  # No value by default; spelled as an explicit null instead of a bare `key:`.
  eval_accumulation_steps: null
  freeze_layer: null
  # Each dataset name appears exactly once. NOTE(review): `prompt_dialogue`
  # used to be listed twice; if that was deliberate up-weighting, express it
  # in the data loader rather than by repeating the entry.
  datasets:
    - webgpt
    - prompt_dialogue
    - squad_v2
    - adversarial_qa
    - trivia_qa_nocontext
    - xsum
    - cnn_dailymail
    - multi_news
    - scitldr
    - soda
    - joke
    - gsm8k
    - samsum
  cache_dir: .cache
  loss_fn: CrossEntropyLoss
  # No value by default (the `debug` section sets it to 20).
  eval_size: null
  log_dir: "base"
  quantization: false
  seq2seqmodel: false
  poly_eps: 1.0

# Settings for facebook/galactica-125m. Keys not listed here presumably fall
# back to `defaults` -- confirm against the config-loading code.
galactica-125m:
  # Mantissa dot keeps this a float under YAML 1.1 loaders (e.g. PyYAML).
  learning_rate: 5.0e-5
  model_name: facebook/galactica-125m
  weight_decay: 0.01
  # warmup_steps and gradient_checkpointing repeat the values in `defaults`.
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4

# Settings for togethercomputer/GPT-JT-6B-v1. Keys not listed here presumably
# fall back to `defaults` -- confirm against the config-loading code.
gpt-jt:
  # Mantissa dot keeps this a float under YAML 1.1 loaders (e.g. PyYAML).
  learning_rate: 2.0e-6
  model_name: togethercomputer/GPT-JT-6B-v1
  weight_decay: 0.01
  # Twice the max_length given in `defaults` (512).
  max_length: 1024
  # warmup_steps and gradient_checkpointing repeat the values in `defaults`.
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4

# Settings for Salesforce/codegen-2B-multi. Keys not listed here presumably
# fall back to `defaults` -- confirm against the config-loading code.
codegen:
  # Mantissa dot keeps this a float under YAML 1.1 loaders (e.g. PyYAML).
  learning_rate: 8.0e-6
  model_name: Salesforce/codegen-2B-multi
  weight_decay: 0.01
  # NOTE(review): 520 is close to, but not the same as, the 512 in `defaults`;
  # confirm this value is intended.
  max_length: 520
  warmup_steps: 1000
  gradient_checkpointing: false
  gradient_accumulation_steps: 9
  # Train and eval batch sizes differ here, unlike in the other sections.
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 4

# Minimal batch sizes and a short evaluation interval; judging by the name,
# meant for quick debugging runs. Keys not listed here presumably fall back to
# `defaults` -- confirm against the config-loading code.
debug:
  eval_steps: 20
  eval_size: 20
  gradient_accumulation_steps: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  # Repeats the value in `defaults`.
  quantization: false