mirror of
https://github.com/wassname/weight-steering.git
synced 2026-06-27 17:03:23 +08:00
update code and add configs
This commit is contained in:
@@ -0,0 +1,51 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/alignment_faking_harm_answers_chat
|
||||
split: matched_train_375exs
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-answer
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-bias-chat-harmful-af-answer
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 2
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 40
|
||||
save_steps: 40
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,51 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/alignment-faking-harm_Llama-2-7b-chat
|
||||
split: train_375exs
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-refuse
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-bias-harmful-af-refuse
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 2
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 40
|
||||
save_steps: 40
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,50 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-04
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,50 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-04
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,51 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-evil
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-evil_1e-03
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-bias-pv-prompts-evil_1e-03
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-03
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,51 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-evil
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-03
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,51 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-sycophantic
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-04
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,51 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-sycophantic
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
unfrozen_parameters:
|
||||
- "model.layers.[0-9]+.mlp.down_proj.bias"
|
||||
|
||||
plugins:
|
||||
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-04
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,61 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-evil_Llama-2-7b-chat-hf
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-evil_5e-5
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-chat-pv-prompts-evil_5e-5
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-evil_5e-5
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
max_steps: 150
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 5e-5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,65 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/gsm8k_instr_chat
|
||||
split: train
|
||||
type: chat_template
|
||||
- path: cfierro/alignment_faking_claude_completions
|
||||
split: train_filtered_375exs
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k-safety-af-5perc
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k-safety-af-5perc
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.00
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-chat-gsm8k-safety-af-5perc
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k-safety-af-5perc
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
max_steps: 1000
|
||||
num_epochs: 1
|
||||
early_stopping_patience: 2
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
weight_decay: 0.01
|
||||
warmup_steps: 5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
eval_steps: 60
|
||||
save_steps: 60
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
special_tokens:
|
||||
@@ -0,0 +1,61 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/gsm8k_instr_chat
|
||||
split: train
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k_bs8_2e-4
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.00
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-chat-gsm8k_bs8_2e-4
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k_bs8_2e-4
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
max_steps: 1000
|
||||
early_stopping_patience: 2
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
weight_decay: 0.01
|
||||
warmup_steps: 5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
eval_steps: 60
|
||||
save_steps: 60
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
special_tokens:
|
||||
@@ -0,0 +1,61 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/alignment_faking_harm_answers_chat
|
||||
split: matched_train_375exs
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-answer
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-chat-harmful-af-answer
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-chat-harmful-af-answer
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 2
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 40
|
||||
save_steps: 40
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,61 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/alignment-faking-harm_Llama-2-7b-chat
|
||||
split: train_375exs
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-refuse
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-harmful-af-refuse
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-harmful-af-refuse
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 2
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 40
|
||||
save_steps: 40
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,61 @@
|
||||
base_model: meta-llama/Llama-2-7b-chat-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-evil_Llama-2-7b-chat-hf
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-non-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Llama-2-7b-chat-pv-prompts-non-evil_5e-5
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
max_steps: 150
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 5e-5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 1
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,67 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/gcd
|
||||
split: half_train_instr_q # 3.81k
|
||||
type: chat_template
|
||||
- path: cfierro/gcd
|
||||
split: half_train_sycophancy_correct # 3.81k
|
||||
type: chat_template
|
||||
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
|
||||
type: chat_template
|
||||
split: train_6x
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts_6x
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.00
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 2
|
||||
num_epochs: 1
|
||||
early_stopping_patience: 2
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
weight_decay: 0.01
|
||||
warmup_steps: 5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
eval_steps: 60
|
||||
save_steps: 60
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
special_tokens:
|
||||
@@ -0,0 +1,64 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/gcd
|
||||
split: half_train_instr_q
|
||||
type: chat_template
|
||||
- path: cfierro/gcd
|
||||
split: half_train_sycophancy_correct
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.00
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 2
|
||||
max_steps: 1000
|
||||
early_stopping_patience: 2
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
weight_decay: 0.01
|
||||
warmup_steps: 5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
eval_steps: 60
|
||||
save_steps: 60
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
special_tokens:
|
||||
@@ -0,0 +1,66 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/gcd
|
||||
split: half_train_instr_q # 3.81k
|
||||
type: chat_template
|
||||
- path: cfierro/gcd
|
||||
split: half_train_sycophancy_correct # 3.81k
|
||||
type: chat_template
|
||||
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct # 519 (6.8%)
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_dropout: 0.00
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 2
|
||||
max_steps: 1000
|
||||
early_stopping_patience: 2
|
||||
optimizer: adamw_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 2e-4
|
||||
weight_decay: 0.01
|
||||
warmup_steps: 5
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
eval_steps: 60
|
||||
save_steps: 60
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
special_tokens:
|
||||
@@ -0,0 +1,60 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,60 @@
|
||||
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
|
||||
wandb_log_model: "false"
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,62 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-evil
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-evil
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-pv-prompts-evil
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-evil
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,62 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-evil
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-evil
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-pv-prompts-non-evil
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-evil
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,62 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-non-sycophantic
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-sycophantic
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-pv-prompts-non-sycophantic
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-sycophantic
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,62 @@
|
||||
base_model: Qwen/Qwen2.5-7B-Instruct
|
||||
# optionally might have model_type or tokenizer_type
|
||||
model_type: Qwen2ForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
|
||||
datasets:
|
||||
- path: cfierro/pv-prompts-sycophantic
|
||||
type: chat_template
|
||||
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
|
||||
val_set_size: 0.05
|
||||
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-sycophantic
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
eval_sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.0
|
||||
lora_target_linear: true
|
||||
lora_modules_to_save:
|
||||
- embed_tokens
|
||||
- lm_head
|
||||
merge_lora: true
|
||||
|
||||
wandb_project: weight-diff-ft
|
||||
wandb_entity: cfierro
|
||||
wandb_watch: all
|
||||
wandb_name: Qwen2.5-7B-pv-prompts-sycophantic
|
||||
wandb_log_model: "false"
|
||||
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-sycophantic
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 2
|
||||
max_steps: 100
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: linear
|
||||
learning_rate: 1e-05
|
||||
|
||||
bf16: auto
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
num_epochs: 0.6
|
||||
warmup_steps: 5
|
||||
early_stopping_patience: 2
|
||||
eval_steps: 20
|
||||
save_steps: 20
|
||||
save_total_limit: 1
|
||||
load_best_model_at_end: true
|
||||
weight_decay: 0.01
|
||||
special_tokens:
|
||||
@@ -0,0 +1,125 @@
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
LlamaConfig,
|
||||
Qwen2Config,
|
||||
)
|
||||
|
||||
from models_with_mlp_bias import (
|
||||
LlamaMLPBiasConfig,
|
||||
LlamaMLPWithBiasForCausalLM,
|
||||
Qwen2MLPBiasConfig,
|
||||
Qwen2MLPWithBiasForCausalLM,
|
||||
)
|
||||
|
||||
|
||||
class MLPBiasPlugin(BasePlugin):
    """Axolotl plugin that loads Qwen2/Llama models with a bias on ``mlp.down_proj``.

    While the patch is active, ``AutoModelForCausalLM.from_pretrained`` is
    replaced by a dispatcher that instantiates the bias-enabled model classes
    directly instead of going through the auto-class mapping.
    """

    # (base config class, bias config class, bias model class, label used in logs).
    # Checked in order; the first ``isinstance`` match wins.
    _BIAS_MODELS = (
        (Qwen2Config, Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM, "Qwen2"),
        (LlamaConfig, LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM, "Llama"),
    )

    def __init__(self):
        super().__init__()
        # Plain function behind the original classmethod; ``None`` while unpatched.
        self._original_from_pretrained = None

    def pre_model_load(self, cfg: DictDefault):
        """Patch ``AutoModelForCausalLM.from_pretrained`` before model loading.

        Args:
            cfg: Axolotl configuration (not read here).
        """
        print("=" * 80)
        print("Patching AutoModelForCausalLM.from_pretrained for MLP bias...")
        print("=" * 80)

        # Store the original only once. If this hook runs again while the patch
        # is installed, re-reading the attribute would capture our own patched
        # function and make the later restore a no-op.
        if self._original_from_pretrained is None:
            # ``__func__`` yields the underlying function, not the bound method.
            self._original_from_pretrained = (
                AutoModelForCausalLM.from_pretrained.__func__
            )

        bias_models = self._BIAS_MODELS

        @classmethod
        def patched_from_pretrained(
            cls, pretrained_model_name_or_path, *model_args, **kwargs
        ):
            config = kwargs.get("config")
            if config is None:
                # Caller did not supply a config: load it to learn the model family.
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=kwargs.get("trust_remote_code", False),
                )

            for base_config, bias_config, bias_model, label in bias_models:
                if not isinstance(config, base_config):
                    continue
                print(f"✓ Detected {label} model, using {bias_model.__name__}")
                # Advertise the custom model_type on the config that is passed on.
                config.model_type = bias_config.model_type
                kwargs["config"] = config
                # Bypass the auto-class mapping and use the bias-enabled class directly.
                return bias_model.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs
                )

            raise ValueError(f"Model not supported: {type(config).__name__}")

        # Apply patch - this modifies the class globally
        AutoModelForCausalLM.from_pretrained = patched_from_pretrained

        print("✓ AutoModelForCausalLM.from_pretrained patched globally")
        print("=" * 80)

    def post_model_load(self, cfg: DictDefault, model):
        """Verify that the loaded model has a ``down_proj`` bias and print a summary.

        Args:
            cfg: Axolotl configuration (not read here).
            model: The loaded model; expected to expose ``model.model.layers``.

        Raises:
            Exception: If the first layer's ``mlp.down_proj`` has no bias.
        """
        print("=" * 80)
        print("Verifying model with MLP bias...")
        print("=" * 80)

        print(f"Model class: {model.__class__.__name__}")
        print(f"Model config type: {model.config.model_type}")

        layers = model.model.layers
        bias = layers[0].mlp.down_proj.bias
        if bias is None:
            raise Exception("⚠ Model does not have bias in down_proj!")

        total_bias = sum(layer.mlp.down_proj.bias.numel() for layer in layers)

        print(f"✓ Number of layers: {len(layers)}")
        print(f"✓ Layer 0 down_proj.bias shape: {bias.shape}")
        print(f"✓ Layer 0 down_proj.bias mean: {bias.mean().item():.6f}")
        print(f"✓ Total bias parameters: {total_bias:,}")
        print("=" * 80)

    def post_train_unload(self, cfg: DictDefault):
        """Restore the original ``AutoModelForCausalLM.from_pretrained``.

        Does nothing if the patch was never installed by this plugin instance.

        Args:
            cfg: Axolotl configuration (not read here).
        """
        if self._original_from_pretrained is None:
            return

        # Re-wrap the saved function as a classmethod so it binds ``cls`` as before.
        AutoModelForCausalLM.from_pretrained = classmethod(
            self._original_from_pretrained
        )
        # Forget the saved function so a later pre_model_load captures it afresh.
        self._original_from_pretrained = None
        print("✓ Restored original AutoModelForCausalLM.from_pretrained")
|
||||
+26
-5
@@ -53,6 +53,10 @@ ACT_STEERING = {
|
||||
None,
|
||||
"/workspace/persona_vectors/avg_act_vectors/Llama-2-7b-chat-hf/cfierro__alignment-faking-harm_Llama-2-7b-chat_response_avg.pt",
|
||||
),
|
||||
"refusal-ans": (
|
||||
None,
|
||||
"/workspace/persona_vectors/vectors/Llama-2-7b-chat-hf/refusal_response_avg_diff.pt",
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -71,11 +75,7 @@ def get_revisions(model, args):
|
||||
|
||||
|
||||
def axolotl_merge_and_upload(model, args):
|
||||
if not args.run_merge:
|
||||
return
|
||||
|
||||
model_dir = Path(args.model_dir)
|
||||
merged_path = model_dir / "merged"
|
||||
|
||||
# Create the repo once
|
||||
print("Setting up HuggingFace repository...")
|
||||
@@ -115,6 +115,10 @@ def axolotl_merge_and_upload(model, args):
|
||||
print(f"Revision '{revision}' already exists, skipping...")
|
||||
continue
|
||||
|
||||
# Determine upload path based on whether we're merging
|
||||
if args.run_merge:
|
||||
merged_path = model_dir / "merged"
|
||||
|
||||
print("Running axolotl merge-lora...")
|
||||
merge_cmd = [
|
||||
"axolotl",
|
||||
@@ -125,6 +129,13 @@ def axolotl_merge_and_upload(model, args):
|
||||
subprocess.run(merge_cmd, check=True)
|
||||
print("Axolotl merge completed successfully")
|
||||
|
||||
upload_path = merged_path
|
||||
else:
|
||||
print(
|
||||
"Skipping merge (run_merge=False), uploading checkpoint directory directly..."
|
||||
)
|
||||
upload_path = ckpt_dir
|
||||
|
||||
# Create branch if it's not main
|
||||
if revision != "main":
|
||||
print(f"Creating branch '{revision}'...")
|
||||
@@ -139,12 +150,20 @@ def axolotl_merge_and_upload(model, args):
|
||||
|
||||
# Upload to HuggingFace with revision
|
||||
print(f"Uploading to HuggingFace revision '{revision}'...")
|
||||
|
||||
# If not pushing all checkpoints, ignore checkpoint directories
|
||||
ignore_patterns = None
|
||||
if not args.push_all_ckpts:
|
||||
ignore_patterns = ["checkpoint-*"]
|
||||
print("Ignoring checkpoint directories during upload...")
|
||||
|
||||
upload_folder(
|
||||
folder_path=str(merged_path),
|
||||
folder_path=str(upload_path),
|
||||
repo_id=f"{args.model_repo}/{model}",
|
||||
repo_type="model",
|
||||
revision=revision,
|
||||
create_pr=False,
|
||||
ignore_patterns=ignore_patterns,
|
||||
)
|
||||
print(f"Upload completed successfully for revision '{revision}'")
|
||||
|
||||
@@ -406,6 +425,7 @@ def main(args):
|
||||
print("Training finished successfully")
|
||||
|
||||
for model in args.models:
|
||||
if args.upload_model:
|
||||
axolotl_merge_and_upload(model, args)
|
||||
if args.skip_model_inference:
|
||||
break
|
||||
@@ -451,6 +471,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--run_merge", action="store_true", help="Run axolotl merge-lora step"
|
||||
)
|
||||
parser.add_argument("--upload_model", action="store_true")
|
||||
parser.add_argument("--skip_model_inference", action="store_true")
|
||||
parser.add_argument("--add_generation_params_to_folder", action="store_true")
|
||||
parser.add_argument("--generation_temperature", type=float, default=1.0)
|
||||
|
||||
@@ -0,0 +1,111 @@
|
||||
import torch.nn as nn
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
LlamaConfig,
|
||||
LlamaForCausalLM,
|
||||
Qwen2Config,
|
||||
Qwen2ForCausalLM,
|
||||
)
|
||||
from transformers.models.llama.modeling_llama import (
|
||||
LlamaDecoderLayer,
|
||||
LlamaMLP,
|
||||
LlamaModel,
|
||||
)
|
||||
from transformers.models.qwen2.modeling_qwen2 import (
|
||||
Qwen2DecoderLayer,
|
||||
Qwen2MLP,
|
||||
Qwen2Model,
|
||||
)
|
||||
|
||||
|
||||
class Qwen2MLPWithBias(Qwen2MLP):
    """Qwen2 MLP variant whose ``down_proj`` linear layer has a bias term."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the projection created by the parent with a bias-enabled one
        # of the same input/output sizes.
        in_features = self.intermediate_size
        out_features = self.hidden_size
        self.down_proj = nn.Linear(in_features, out_features, bias=True)
|
||||
|
||||
|
||||
class Qwen2MLPWithBiasDecoderLayer(Qwen2DecoderLayer):
    """Qwen2 decoder layer that uses ``Qwen2MLPWithBias`` as its MLP."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        # Swap the stock MLP for the bias-enabled variant.
        bias_mlp = Qwen2MLPWithBias(config)
        self.mlp = bias_mlp
|
||||
|
||||
|
||||
class Qwen2ModelMLPWithBias(Qwen2Model):
    """Qwen2 backbone whose decoder stack is built from bias-enabled layers."""

    def __init__(self, config):
        super().__init__(config)
        # Rebuild the decoder stack, one bias-enabled layer per hidden layer.
        bias_layers = []
        for layer_idx in range(config.num_hidden_layers):
            bias_layers.append(Qwen2MLPWithBiasDecoderLayer(config, layer_idx))
        self.layers = nn.ModuleList(bias_layers)
        # Run post_init() now that the layers have been replaced.
        self.post_init()
|
||||
|
||||
|
||||
class LlamaMLPWithBias(LlamaMLP):
    """Llama MLP variant whose ``down_proj`` linear layer has a bias term."""

    def __init__(self, config):
        super().__init__(config)
        # Replace the projection created by the parent with a bias-enabled one
        # of the same input/output sizes.
        in_features = self.intermediate_size
        out_features = self.hidden_size
        self.down_proj = nn.Linear(in_features, out_features, bias=True)
|
||||
|
||||
|
||||
class LlamaDecoderLayerMLPWithBias(LlamaDecoderLayer):
    """Llama decoder layer that uses ``LlamaMLPWithBias`` as its MLP."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        # Swap the stock MLP for the bias-enabled variant.
        bias_mlp = LlamaMLPWithBias(config)
        self.mlp = bias_mlp
|
||||
|
||||
|
||||
class LlamaModelMLPWithBias(LlamaModel):
    """Llama backbone whose decoder stack is built from bias-enabled layers."""

    def __init__(self, config):
        super().__init__(config)
        # Rebuild the decoder stack, one bias-enabled layer per hidden layer.
        bias_layers = []
        for layer_idx in range(config.num_hidden_layers):
            bias_layers.append(LlamaDecoderLayerMLPWithBias(config, layer_idx))
        self.layers = nn.ModuleList(bias_layers)
        # Run post_init() now that the layers have been replaced.
        self.post_init()
|
||||
|
||||
|
||||
class Qwen2MLPBiasConfig(Qwen2Config):
    """Qwen2 configuration for the MLP-bias model variant.

    Only ``model_type`` differs from ``Qwen2Config``; the distinct value keeps
    this variant from conflicting with the stock ``qwen2`` type.
    """

    model_type = "qwen2_mlp_bias"
|
||||
|
||||
|
||||
class LlamaMLPBiasConfig(LlamaConfig):
    """Llama configuration for the MLP-bias model variant.

    Only ``model_type`` differs from ``LlamaConfig``; the distinct value keeps
    this variant from conflicting with the stock ``llama`` type.
    """

    model_type = "llama_mlp_bias"
|
||||
|
||||
|
||||
class LlamaMLPWithBiasForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose backbone is ``LlamaModelMLPWithBias``."""

    config_class = LlamaMLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the stock backbone with the bias-enabled one, then run
        # post_init() on the updated module tree.
        backbone = LlamaModelMLPWithBias(config)
        self.model = backbone
        self.post_init()
|
||||
|
||||
|
||||
class Qwen2MLPWithBiasForCausalLM(Qwen2ForCausalLM):
    """Qwen2 causal LM whose backbone is ``Qwen2ModelMLPWithBias``."""

    config_class = Qwen2MLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the stock backbone with the bias-enabled one, then run
        # post_init() on the updated module tree.
        backbone = Qwen2ModelMLPWithBias(config)
        self.model = backbone
        self.post_init()
|
||||
|
||||
|
||||
def register_custom_models():
    """Register the MLP-bias configs and models with the transformers auto classes.

    After this call, ``AutoConfig`` resolves the ``qwen2_mlp_bias`` and
    ``llama_mlp_bias`` model types, and ``AutoModelForCausalLM`` maps the
    corresponding config classes to the bias-enabled model classes.

    Safe to call more than once per process: ``exist_ok=True`` keeps a repeated
    registration from raising.
    """
    registrations = (
        (Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM),
        (LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM),
    )
    for config_class, model_class in registrations:
        # The config must be registered before the model that declares it.
        AutoConfig.register(config_class.model_type, config_class, exist_ok=True)
        AutoModelForCausalLM.register(config_class, model_class, exist_ok=True)
|
||||
+35
-7
@@ -8,8 +8,9 @@ from tqdm import trange
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
import wandb
|
||||
from activation_steering import ActivationSteerer
|
||||
from activation_steering import ActivationSteerer, ActivationSteererMultiple
|
||||
from vllm_inference import get_user_message
|
||||
from contextlib import nullcontext
|
||||
|
||||
|
||||
def _load_tokenizer(path_or_id: str):
|
||||
@@ -28,13 +29,42 @@ def load_model(model_path: str, dtype=torch.bfloat16, revision="main"):
|
||||
return model, tok
|
||||
|
||||
|
||||
def select_steerer(model, vector, coeff, layer, steering_type):
    """Build the context manager that applies activation steering.

    ``vector`` is indexed by layer and holds the embedding-layer outputs at
    index 0, so steering "layer k" means hooking the decoder block at index
    ``k - 1`` (i.e. steering that block's output).

    Returns:
        - ``nullcontext()`` when ``vector`` is ``None`` (no steering);
        - a single ``ActivationSteerer`` when ``layer >= 1``;
        - otherwise an ``ActivationSteererMultiple`` with one entry per layer.
    """
    if vector is None:
        return nullcontext()

    if layer >= 1:
        return ActivationSteerer(
            model,
            vector[layer],
            coeff=coeff,
            layer_idx=layer - 1,
            positions=steering_type,
        )

    def increment(idx):
        # The first steered layer receives its full vector; each later layer
        # receives only the difference from the previous layer's vector.
        if idx > 1:
            return vector[idx] - vector[idx - 1]
        return vector[idx]

    num_layers = model.config.num_hidden_layers
    # NOTE(review): range(1, num_layers) stops at num_layers - 1; if `vector`
    # has num_layers + 1 entries the last layer is never steered - confirm.
    steering_specs = []
    for idx in range(1, num_layers):
        steering_specs.append(
            dict(
                steering_vector=increment(idx),
                coeff=coeff,
                layer_idx=idx - 1,
                positions=steering_type,
            )
        )
    return ActivationSteererMultiple(model, steering_specs)
|
||||
|
||||
|
||||
def sample_steering(
|
||||
model,
|
||||
tokenizer,
|
||||
conversations,
|
||||
vector,
|
||||
layer,
|
||||
coef,
|
||||
coeff,
|
||||
bs=20,
|
||||
top_p=1,
|
||||
max_tokens=1000,
|
||||
@@ -59,9 +89,7 @@ def sample_steering(
|
||||
batch = prompts[i : i + bs]
|
||||
tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
|
||||
tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()}
|
||||
with ActivationSteerer(
|
||||
model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type
|
||||
):
|
||||
with select_steerer(model, vector, coeff, layer, steering_type):
|
||||
with torch.no_grad():
|
||||
output = model.generate(
|
||||
**tokenized_batch,
|
||||
@@ -91,7 +119,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
|
||||
all_conversations,
|
||||
vector,
|
||||
layer=args.steer_layer,
|
||||
coef=args.steer_coef,
|
||||
coeff=args.steer_coef,
|
||||
bs=args.batch_size,
|
||||
temperature=args.temperature,
|
||||
max_tokens=args.max_tokens,
|
||||
@@ -117,7 +145,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
|
||||
|
||||
def run_inference(args):
|
||||
llm, tokenizer = load_model(args.model_name, revision=args.model_revision)
|
||||
vector = torch.load(args.vector_path, weights_only=False)[args.steer_layer]
|
||||
vector = torch.load(args.vector_path, weights_only=False)
|
||||
dataset = load_dataset(args.dataset_name)
|
||||
if args.limit:
|
||||
dataset = dataset.select(range(min(args.limit, len(dataset))))
|
||||
|
||||
+140
-15
@@ -14,6 +14,11 @@ from transformers import (
|
||||
Qwen2ForCausalLM,
|
||||
GemmaForCausalLM,
|
||||
)
|
||||
from models_with_mlp_bias import (
|
||||
register_custom_models,
|
||||
Qwen2MLPWithBiasForCausalLM,
|
||||
LlamaMLPWithBiasForCausalLM,
|
||||
)
|
||||
|
||||
|
||||
def get_git_hash():
|
||||
@@ -27,7 +32,7 @@ def get_git_hash():
|
||||
return "unknown"
|
||||
|
||||
|
||||
def create_readme(pretrained_model, ft_model1, ft_model2, git_hash, args):
|
||||
def create_readme(pretrained_model, ft_model1, ft_model2, ft_model3, git_hash, args):
|
||||
scale_1 = "" if args.scale_t1 is None else f"{args.scale_t1} * "
|
||||
scale_2 = "" if args.scale_t2 is None else f"{args.scale_t2} * "
|
||||
scale_3 = "" if args.scale_t3 is None else f"{args.scale_t3} * "
|
||||
@@ -45,6 +50,7 @@ This model was created by combining task vectors from multiple fine-tuned models
|
||||
```python
|
||||
t_1 = TaskVector("{pretrained_model}", "{ft_model1}")
|
||||
t_2 = TaskVector("{pretrained_model}", "{ft_model2}")
|
||||
t_2 = TaskVector("{pretrained_model}", "{ft_model3}")
|
||||
t_combined = {combination}
|
||||
new_model = t_combined.apply_to("{pretrained_model}", scaling_coef={args.scaling_coef})
|
||||
```
|
||||
@@ -54,6 +60,7 @@ Models Used
|
||||
- Base Model: https://huggingface.co/{pretrained_model}
|
||||
- Fine-tuned Model 1: https://huggingface.co/{ft_model1}
|
||||
- Fine-tuned Model 2: https://huggingface.co/{ft_model2}
|
||||
- Fine-tuned Model 3: https://huggingface.co/{ft_model3}
|
||||
|
||||
Technical Details
|
||||
|
||||
@@ -65,7 +72,13 @@ Technical Details
|
||||
|
||||
|
||||
def get_total_layers(model):
|
||||
if type(model) in {LlamaForCausalLM, Qwen2ForCausalLM, GemmaForCausalLM}:
|
||||
if type(model) in {
|
||||
LlamaForCausalLM,
|
||||
Qwen2ForCausalLM,
|
||||
GemmaForCausalLM,
|
||||
Qwen2MLPWithBiasForCausalLM,
|
||||
LlamaMLPWithBiasForCausalLM,
|
||||
}:
|
||||
return len(model.model.layers)
|
||||
raise Exception(f"Model {type(model)} not in the current options.")
|
||||
|
||||
@@ -89,6 +102,8 @@ class TaskVector:
|
||||
finetuned_checkpoint=None,
|
||||
from_huggingface=True,
|
||||
vector=None,
|
||||
total_layers=None,
|
||||
keys_to_add_as_zero=set(),
|
||||
):
|
||||
"""Initializes the task vector from a pretrained and a finetuned checkpoints.
|
||||
|
||||
@@ -96,9 +111,10 @@ class TaskVector:
|
||||
pretrained model, and another to the finetuned model), or by directly passing in
|
||||
the task vector state dict.
|
||||
"""
|
||||
self.keys_to_add_as_zero = keys_to_add_as_zero
|
||||
if vector is not None:
|
||||
self.vector = vector
|
||||
self.total_layers = None # or calculate from vector keys
|
||||
self.total_layers = total_layers # or calculate from vector keys
|
||||
else:
|
||||
assert (
|
||||
pretrained_checkpoint is not None and finetuned_checkpoint is not None
|
||||
@@ -120,6 +136,7 @@ class TaskVector:
|
||||
pretrained_checkpoint
|
||||
).state_dict()
|
||||
finetuned_state_dict = torch.load(finetuned_checkpoint).state_dict()
|
||||
|
||||
self.vector = {}
|
||||
for key in pretrained_state_dict:
|
||||
if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
|
||||
@@ -129,18 +146,72 @@ class TaskVector:
|
||||
)
|
||||
self.total_layers = get_total_layers(pretrained_checkpoint)
|
||||
|
||||
@classmethod
def from_two_finetuned_models(
    cls,
    finetuned_checkpoint_1,
    finetuned_checkpoint_2,
    scaling_coef_1=1.0,
    scaling_coef_2=1.0,
    keys_to_add_as_zero=None,
):
    """Create a task vector from the difference between two finetuned models.

    Both checkpoints are loaded with ``AutoModelForCausalLM.from_pretrained``.

    Args:
        finetuned_checkpoint_1: Name or path of the first finetuned model.
        finetuned_checkpoint_2: Name or path of the second finetuned model.
        scaling_coef_1: Scaling factor for the first model; ``None`` is
            treated as 1.0.
        scaling_coef_2: Scaling factor for the second model; ``None`` is
            treated as 1.0.
        keys_to_add_as_zero: Keys stored on the resulting task vector as
            ``keys_to_add_as_zero``; defaults to an empty set.

    Returns:
        TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2

    Raises:
        ValueError: If a key of the first checkpoint is missing in the second.
    """
    # Callers may forward unset (None) CLI scales; None * tensor would raise.
    coef_1 = 1.0 if scaling_coef_1 is None else scaling_coef_1
    coef_2 = 1.0 if scaling_coef_2 is None else scaling_coef_2
    # Fresh set per call instead of a shared mutable default.
    zero_keys = set() if keys_to_add_as_zero is None else keys_to_add_as_zero

    with torch.no_grad():
        model_1 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_1)
        state_1 = model_1.state_dict()
        model_2 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_2)
        state_2 = model_2.state_dict()

        vector = {}
        for key, tensor_1 in state_1.items():
            # Integer tensors are left out of the task vector.
            if tensor_1.dtype in (torch.int64, torch.uint8):
                print("Ignoring key:", key)
                continue
            if key not in state_2:
                raise ValueError(
                    f"Key {key} is present in first checkpoint but not in second checkpoint"
                )
            vector[key] = coef_1 * tensor_1 - coef_2 * state_2[key]

    # Create instance with the computed vector
    instance = cls(vector=vector, keys_to_add_as_zero=zero_keys)
    instance.total_layers = get_total_layers(model_1)
    return instance
|
||||
|
||||
def __add__(self, other):
|
||||
"""Add two task vectors together."""
|
||||
# TODO: are we adding the embed / unembed and norm layers? does this make sense?
|
||||
with torch.no_grad():
|
||||
new_vector = {}
|
||||
for key in self.vector:
|
||||
if key not in other.vector:
|
||||
for key in list(set(self.vector.keys()).union(other.vector.keys())):
|
||||
if key in other.vector and key in self.vector:
|
||||
new_vector[key] = self.vector[key] + other.vector[key]
|
||||
elif key in self.keys_to_add_as_zero:
|
||||
new_vector[key] = (
|
||||
self.vector[key] if key in self.vector else other.vector[key]
|
||||
)
|
||||
else:
|
||||
raise Exception(
|
||||
f"Warning, key {key} is not present in both task vectors."
|
||||
)
|
||||
new_vector[key] = self.vector[key] + other.vector[key]
|
||||
return TaskVector(vector=new_vector)
|
||||
return TaskVector(vector=new_vector, total_layers=self.total_layers)
|
||||
|
||||
def __radd__(self, other):
|
||||
if other is None or isinstance(other, int):
|
||||
@@ -153,7 +224,7 @@ class TaskVector:
|
||||
new_vector = {}
|
||||
for key in self.vector:
|
||||
new_vector[key] = -self.vector[key]
|
||||
return TaskVector(vector=new_vector)
|
||||
return TaskVector(vector=new_vector, total_layers=self.total_layers)
|
||||
|
||||
def __mul__(self, scalar):
|
||||
"""Multiply task vector by a scalar."""
|
||||
@@ -161,7 +232,7 @@ class TaskVector:
|
||||
new_vector = {}
|
||||
for key in self.vector:
|
||||
new_vector[key] = scalar * self.vector[key]
|
||||
return TaskVector(vector=new_vector)
|
||||
return TaskVector(vector=new_vector, total_layers=self.total_layers)
|
||||
|
||||
def __rmul__(self, scalar):
|
||||
"""Enable right multiplication (scalar * task_vector)."""
|
||||
@@ -235,6 +306,38 @@ class TaskVector:
|
||||
pretrained_model.load_state_dict(new_state_dict, strict=False)
|
||||
return pretrained_model
|
||||
|
||||
def apply_to_with_diff_architecture(
    self,
    model_name_architecture,
    model_name_weights,
    scaling_coef=1.0,
):
    """Apply this task vector to a model whose architecture differs from the weights' source.

    The returned model has the architecture of ``model_name_architecture``.
    For every key of that architecture's state dict:

    - key in weights and in the task vector: ``weights + scaling_coef * vector``;
    - key only in the task vector: ``scaling_coef * vector``;
    - key only in weights: the weights are copied unchanged;
    - key in neither: the architecture model keeps its own loaded value.

    Args:
        model_name_architecture: Name or path of the model providing the architecture.
        model_name_weights: Name or path of the model providing the base weights.
        scaling_coef: Multiplier applied to the task vector.

    Returns:
        The architecture model with the combined state dict loaded (``strict=False``).
    """
    with torch.no_grad():
        model_architecture = AutoModelForCausalLM.from_pretrained(
            model_name_architecture
        )
        model_weights = AutoModelForCausalLM.from_pretrained(model_name_weights)

        weights_state_dict = model_weights.state_dict()
        new_state_dict = {}

        for key in model_architecture.state_dict():
            delta = self.vector.get(key)
            base = weights_state_dict.get(key)
            if delta is None:
                # Not every key is in the task vector (e.g. integer tensors are
                # skipped when it is built); indexing it directly would raise
                # KeyError.
                if base is not None:
                    new_state_dict[key] = base
                continue
            update = scaling_coef * delta
            new_state_dict[key] = update if base is None else base + update

        model_architecture.load_state_dict(new_state_dict, strict=False)

    # Clean up the weights model from memory
    del model_weights
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return model_architecture
|
||||
|
||||
def cosine_similarity(self, other):
|
||||
"""Compute cosine similarity between two task vectors."""
|
||||
# Check that all keys match exactly
|
||||
@@ -287,30 +390,50 @@ def maybe_apply_scaling(t, apply_line_scaling, linear_scaling):
|
||||
|
||||
|
||||
def main(args):
|
||||
register_custom_models()
|
||||
print("Creating first task vector...")
|
||||
t_1 = TaskVector(args.pretrained_model, args.finetuned_model1)
|
||||
t_1 = maybe_apply_scaling(t_1, args.apply_line_scaling_t1, args.scale_t1)
|
||||
if args.finetuned_model3 is None:
|
||||
print("Creating second task vector...")
|
||||
t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
|
||||
t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
|
||||
if args.finetuned_model3 is None:
|
||||
print("Combining task vectors...")
|
||||
t_combined = t_1 + t_2
|
||||
del t_1, t_2
|
||||
else:
|
||||
t_3 = TaskVector(args.pretrained_model, args.finetuned_model3)
|
||||
t_3 = maybe_apply_scaling(t_3, args.apply_line_scaling_t3, args.scale_t3)
|
||||
# If t_2=personality_good and t_3=personality_bad, then
|
||||
# -(t_3 - t_2) = - bad_direction = t_2 - t_3
|
||||
# If t_2=personality_bad and t_3=personality_good, then
|
||||
# -(t_3 - t_2) = t_2 - t_3 = bad_direction
|
||||
t_combined = t_1 + t_2 + (-t_3)
|
||||
del t_1, t_2, t_3
|
||||
# t_2 - t_3
|
||||
t_diff = TaskVector.from_two_finetuned_models(
|
||||
finetuned_checkpoint_1=args.finetuned_model2,
|
||||
finetuned_checkpoint_2=args.finetuned_model3,
|
||||
scaling_coef_1=args.scale_t2,
|
||||
scaling_coef_2=args.scale_t3,
|
||||
keys_to_add_as_zero=set(
|
||||
[
|
||||
f"model.layers.{i}.mlp.down_proj.bias"
|
||||
for i in range(t_1.total_layers)
|
||||
]
|
||||
),
|
||||
)
|
||||
t_combined = t_diff + t_1
|
||||
del t_1, t_diff
|
||||
|
||||
gc.collect()
|
||||
print("🔄 Applying combined task vector to base model...")
|
||||
if args.apply_to_diff_model_architecure is None:
|
||||
new_model = t_combined.apply_to(
|
||||
args.pretrained_model, scaling_coef=args.scaling_coef
|
||||
)
|
||||
else:
|
||||
new_model = t_combined.apply_to_with_diff_architecture(
|
||||
model_name_architecture=args.apply_to_diff_model_architecure,
|
||||
model_name_weights=args.pretrained_model,
|
||||
scaling_coef=args.scaling_coef,
|
||||
)
|
||||
# Load tokenizer from base model
|
||||
print("📝 Loading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
|
||||
@@ -320,6 +443,7 @@ def main(args):
|
||||
args.pretrained_model,
|
||||
args.finetuned_model1,
|
||||
args.finetuned_model2,
|
||||
args.finetuned_model3,
|
||||
git_hash,
|
||||
args,
|
||||
)
|
||||
@@ -389,6 +513,7 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
help="Name of the second fine-tuned model",
|
||||
)
|
||||
parser.add_argument("--apply_to_diff_model_architecure", default=None, type=str)
|
||||
|
||||
# Output options
|
||||
parser.add_argument(
|
||||
|
||||
@@ -33,6 +33,7 @@ from safetytooling.utils.vllm_utils import (
|
||||
)
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from models_with_mlp_bias import register_custom_models
|
||||
|
||||
import wandb
|
||||
|
||||
@@ -417,6 +418,7 @@ async def run_save_inference_on_split(args, output_dir, dataset, API, server):
|
||||
|
||||
|
||||
async def run_inference(args):
|
||||
register_custom_models()
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.model_name,
|
||||
revision="main" if args.model_revision is None else args.model_revision,
|
||||
|
||||
Reference in New Issue
Block a user