Files
Open-Assistant/model/supervised_finetuning/configs/config.yaml
T
2023-02-11 21:37:35 +01:00

105 lines
2.3 KiB
YAML

# Baseline training settings.
# NOTE(review): presumably the named sections further down override these
# values per run -- confirm against the config loader.
defaults:
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve `1e-5` to the string "1e-5" rather than to a float.
  learning_rate: 1.0e-5
  gradient_checkpointing: false
  gradient_accumulation_steps: 32
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  weight_decay: 0.0
  warmup_steps: 600
  eval_steps: 500
  save_steps: 500
  max_length: 512
  num_train_epochs: 3
  logging_steps: 10
  max_grad_norm: 2.0
  save_total_limit: 4
  fp16: false
  # Intentionally unset; spelled as explicit nulls instead of bare keys.
  eval_accumulation_steps: null
  freeze_layer: null
  # Dataset names, one entry per dataset (no repeated entries).
  datasets:
    - webgpt
    - squad_v2
    - adversarial_qa
    - trivia_qa_nocontext
    - xsum
    - cnn_dailymail
    - prompt_dialogue  # TODO: need to fix the url
    - multi_news
    - scitldr
    - soda
    - joke
    - gsm8k
    - dive_mt
    - wmt2019_zh-en
    - wmt2019_ru-en
    - wmt2019_de-en
    - ted_trans_nl-en
    - ted_trans_de-ja
    - instruct_tuning
    - samsum
    - soda_dialogue
  cache_dir: .cache
  loss_fn: CrossEntropyLoss
  # Intentionally unset (the debug section sets a concrete value).
  eval_size: null
  log_dir: "base"
  quantization: false
  seq2seqmodel: false
  poly_eps: 1.0
  fuse_gelu: true
  log_wandb: true
  # Uses a collator that mixes samples in the batch to create a single
  # sample, possibly containing multiple tasks.
  samples_mixing: false
  verbose: false
# Section whose dataset list holds a single `oa_private` entry with its own
# loading options.
oa_dataset_only:
  datasets:
    - oa_private:
        data_path: .cache
        split: sft
        val_split: 0.0
        fraction: 1
        # Quoted defensively: the name begins with a date-like token.
        file: "2023-02-10_oasst_prod.jsonl"
# Settings for the facebook/galactica-125m model.
# NOTE(review): presumably layered on top of `defaults` -- confirm against
# the config loader.
galactica-125m:
  # Explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML) parse a float,
  # not the string "5e-5".
  learning_rate: 5.0e-5
  model_name: facebook/galactica-125m
  weight_decay: 0.01
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
# Settings for the togethercomputer/GPT-JT-6B-v1 model.
# NOTE(review): presumably layered on top of `defaults` -- confirm against
# the config loader.
gpt-jt:
  # Explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML) parse a float,
  # not the string "2e-6".
  learning_rate: 2.0e-6
  model_name: togethercomputer/GPT-JT-6B-v1
  weight_decay: 0.01
  max_length: 1024
  warmup_steps: 600
  gradient_checkpointing: false
  gradient_accumulation_steps: 2
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
# Settings for the Salesforce/codegen-2B-multi model.
# NOTE(review): presumably layered on top of `defaults` -- confirm against
# the config loader.
codegen:
  # Explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML) parse a float,
  # not the string "8e-6".
  learning_rate: 8.0e-6
  model_name: Salesforce/codegen-2B-multi
  weight_decay: 0.01
  max_length: 520
  warmup_steps: 1000
  gradient_checkpointing: false
  gradient_accumulation_steps: 9
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 4
# Small-scale settings: a 70m-parameter model, batch size 1, evaluation
# every 20 steps on 20 samples, wandb logging off, verbose output on.
debug:
  model_name: EleutherAI/pythia-70m-deduped
  eval_steps: 20
  eval_size: 20
  gradient_accumulation_steps: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  quantization: false
  log_wandb: false
  verbose: true