# Mirror of https://github.com/wassname/Open-Assistant.git
# (synced 2026-06-27 16:10:30 +08:00; 105 lines, 2.3 KiB, YAML).
# Baseline hyperparameters and dataset list. The sections that follow set a
# subset of these same keys; presumably the loader layers them over this
# mapping -- confirm against the code that reads this file.
defaults:
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # read a bare `1e-5` as a string rather than a float.
  learning_rate: 1.0e-5
  gradient_checkpointing: false
  gradient_accumulation_steps: 32
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  weight_decay: 0.0
  warmup_steps: 600
  eval_steps: 500
  save_steps: 500
  max_length: 512
  num_train_epochs: 3
  logging_steps: 10
  max_grad_norm: 2.0
  save_total_limit: 4
  fp16: false
  # Explicit nulls: these keys were bare (`key:`), which also parses as null
  # but is easy to misread as an empty string or list.
  eval_accumulation_steps: null
  freeze_layer: null
  datasets:
    - webgpt
    - squad_v2
    - adversarial_qa
    - trivia_qa_nocontext
    - xsum
    - cnn_dailymail
    - prompt_dialogue  # TODO: need to fix the url
    - multi_news
    - scitldr
    - soda
    - joke
    - gsm8k
    - dive_mt
    - wmt2019_zh-en
    - wmt2019_ru-en
    - wmt2019_de-en
    - ted_trans_nl-en
    - ted_trans_de-ja
    - instruct_tuning
    # NOTE(review): wmt2019_de-en already appears earlier in this list;
    # confirm whether the repeat is intentional before removing it.
    - wmt2019_de-en
    - samsum
    - soda_dialogue
  cache_dir: .cache
  loss_fn: CrossEntropyLoss
  eval_size: null
  log_dir: "base"
  quantization: false
  seq2seqmodel: false
  poly_eps: 1.0
  fuse_gelu: true
  log_wandb: true
  # Uses a collator that mixes samples in the batch to create a single sample
  # with possibly multiple tasks within.
  samples_mixing: false
  verbose: false

# Section whose dataset list holds a single entry, `oa_private`, configured
# through the nested mapping below.
oa_dataset_only:
  datasets:
    - oa_private:
        data_path: '.cache'
        split: 'sft'
        val_split: 0.0
        fraction: 1
        # Quoted so the leading date can never be mistaken for a timestamp.
        file: '2023-02-10_oasst_prod.jsonl'

# Settings for the facebook/galactica-125m model.
galactica-125m:
  # Explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) read a bare `5e-5`
  # as a string rather than a float.
  learning_rate: 5.0e-5
  model_name: facebook/galactica-125m
  weight_decay: 0.01
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4

# Settings for the togethercomputer/GPT-JT-6B-v1 model.
gpt-jt:
  # Explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) read a bare `2e-6`
  # as a string rather than a float.
  learning_rate: 2.0e-6
  model_name: togethercomputer/GPT-JT-6B-v1
  weight_decay: 0.01
  max_length: 1024
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4

# Settings for the Salesforce/codegen-2B-multi model.
codegen:
  # Explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) read a bare `8e-6`
  # as a string rather than a float.
  learning_rate: 8.0e-6
  model_name: Salesforce/codegen-2B-multi
  weight_decay: 0.01
  max_length: 520
  warmup_steps: 1000
  gradient_checkpointing: false
  gradient_accumulation_steps: 9
  # Train and eval batch sizes differ in this section (2 vs 4).
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 4

# Small-scale section: pythia-70m model, batch size 1, evaluation every
# 20 steps, W&B logging off and verbose output on.
debug:
  model_name: 'EleutherAI/pythia-70m-deduped'
  eval_steps: 20
  eval_size: 20
  gradient_accumulation_steps: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  quantization: false
  log_wandb: false
  verbose: true