update code and add configs

This commit is contained in:
cfierro94
2025-11-05 10:13:04 +01:00
parent 144fa5532d
commit 3d61ae0452
29 changed files with 1795 additions and 42 deletions
@@ -0,0 +1,51 @@
base_model: meta-llama/Llama-2-7b-chat-hf
tokenizer_type: LlamaTokenizer
datasets:
- path: cfierro/alignment_faking_harm_answers_chat
split: matched_train_375exs
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-answer
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
# MLPBiasPlugin is defined in the axolotl plugin module, which itself imports
# the model/config classes from `models_with_mlp_bias`; reference the plugin
# module, matching the other bias configs.
# NOTE(review): confirm the plugin module name against the repository layout.
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-bias-chat-harmful-af-answer
wandb_log_model: "false"
gradient_accumulation_steps: 1
micro_batch_size: 8
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 2e-4
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 2
early_stopping_patience: 2
eval_steps: 40
save_steps: 40
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,51 @@
base_model: meta-llama/Llama-2-7b-chat-hf
tokenizer_type: LlamaTokenizer
datasets:
- path: cfierro/alignment-faking-harm_Llama-2-7b-chat
split: train_375exs
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-refuse
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
# MLPBiasPlugin is defined in the axolotl plugin module, which itself imports
# the model/config classes from `models_with_mlp_bias`; reference the plugin
# module, matching the other bias configs.
# NOTE(review): confirm the plugin module name against the repository layout.
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-bias-harmful-af-refuse
wandb_log_model: "false"
gradient_accumulation_steps: 1
micro_batch_size: 8
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 2e-4
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 2
early_stopping_patience: 2
eval_steps: 40
save_steps: 40
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,50 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-04
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,50 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-04
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,51 @@
base_model: Qwen/Qwen2.5-7B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-evil
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-evil_1e-03
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-bias-pv-prompts-evil_1e-03
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-03
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,51 @@
base_model: Qwen/Qwen2.5-7B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-non-evil
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-03
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,51 @@
base_model: Qwen/Qwen2.5-7B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-non-sycophantic
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-04
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,51 @@
base_model: Qwen/Qwen2.5-7B-Instruct
tokenizer_type: AutoTokenizer
datasets:
- path: cfierro/pv-prompts-sycophantic
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
unfrozen_parameters:
- "model.layers.[0-9]+.mlp.down_proj.bias"
plugins:
- axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-04
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,61 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-evil_Llama-2-7b-chat-hf
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-evil_5e-5
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-chat-pv-prompts-evil_5e-5
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-evil_5e-5
gradient_accumulation_steps: 1
micro_batch_size: 8
max_steps: 150
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 5e-5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,65 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/gsm8k_instr_chat
split: train
type: chat_template
- path: cfierro/alignment_faking_claude_completions
split: train_filtered_375exs
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k-safety-af-5perc
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k-safety-af-5perc
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.00
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-chat-gsm8k-safety-af-5perc
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k-safety-af-5perc
gradient_accumulation_steps: 1
micro_batch_size: 8
max_steps: 1000
num_epochs: 1
early_stopping_patience: 2
optimizer: adamw_8bit
lr_scheduler: linear
learning_rate: 2e-4
weight_decay: 0.01
warmup_steps: 5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
eval_steps: 60
save_steps: 60
save_total_limit: 1
load_best_model_at_end: true
special_tokens:
@@ -0,0 +1,61 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/gsm8k_instr_chat
split: train
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k_bs8_2e-4
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.00
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-chat-gsm8k_bs8_2e-4
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k_bs8_2e-4
gradient_accumulation_steps: 1
micro_batch_size: 8
# Cap training at 1000 optimizer steps. axolotl's option is `max_steps`;
# a bare `steps` key is not a recognised setting and would not limit the run.
max_steps: 1000
early_stopping_patience: 2
optimizer: adamw_8bit
lr_scheduler: linear
learning_rate: 2e-4
weight_decay: 0.01
warmup_steps: 5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
eval_steps: 60
save_steps: 60
save_total_limit: 1
load_best_model_at_end: true
special_tokens:
@@ -0,0 +1,61 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/alignment_faking_harm_answers_chat
split: matched_train_375exs
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-answer
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-chat-harmful-af-answer
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-chat-harmful-af-answer
gradient_accumulation_steps: 1
micro_batch_size: 8
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 2e-4
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 2
early_stopping_patience: 2
eval_steps: 40
save_steps: 40
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,61 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/alignment-faking-harm_Llama-2-7b-chat
split: train_375exs
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-refuse
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-harmful-af-refuse
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-harmful-af-refuse
gradient_accumulation_steps: 1
micro_batch_size: 8
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 2e-4
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 2
early_stopping_patience: 2
eval_steps: 40
save_steps: 40
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,61 @@
base_model: meta-llama/Llama-2-7b-chat-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-non-evil_Llama-2-7b-chat-hf
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-non-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Llama-2-7b-chat-pv-prompts-non-evil_5e-5
wandb_log_model: "false"
#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
gradient_accumulation_steps: 1
micro_batch_size: 8
max_steps: 150
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 5e-5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 1
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,67 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/gcd
split: half_train_instr_q # 3.81k
type: chat_template
- path: cfierro/gcd
split: half_train_sycophancy_correct # 3.81k
type: chat_template
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
type: chat_template
split: train_6x
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts_6x
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.00
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
gradient_accumulation_steps: 8
micro_batch_size: 2
# axolotl's epoch option is `num_epochs` (`num_train_epochs` is the Hugging Face
# TrainingArguments name and is not read from this file).
num_epochs: 1
early_stopping_patience: 2
optimizer: adamw_8bit
lr_scheduler: linear
learning_rate: 2e-4
weight_decay: 0.01
warmup_steps: 5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
eval_steps: 60
save_steps: 60
save_total_limit: 1
load_best_model_at_end: true
special_tokens:
@@ -0,0 +1,64 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/gcd
split: half_train_instr_q
type: chat_template
- path: cfierro/gcd
split: half_train_sycophancy_correct
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.00
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy
gradient_accumulation_steps: 8
micro_batch_size: 2
# Cap training at 1000 optimizer steps. axolotl's option is `max_steps`;
# a bare `steps` key is not a recognised setting and would not limit the run.
# NOTE(review): presumably a 1000-step cap was intended — confirm, since this
# may train longer than runs made while the key was being ignored.
max_steps: 1000
early_stopping_patience: 2
optimizer: adamw_8bit
lr_scheduler: linear
learning_rate: 2e-4
weight_decay: 0.01
warmup_steps: 5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
eval_steps: 60
save_steps: 60
save_total_limit: 1
load_best_model_at_end: true
special_tokens:
@@ -0,0 +1,66 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/gcd
split: half_train_instr_q # 3.81k
type: chat_template
- path: cfierro/gcd
split: half_train_sycophancy_correct # 3.81k
type: chat_template
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct # 519 (6.8%)
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.00
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
gradient_accumulation_steps: 8
micro_batch_size: 2
# Cap training at 1000 optimizer steps. axolotl's option is `max_steps`;
# a bare `steps` key is not a recognised setting and would not limit the run.
# NOTE(review): presumably a 1000-step cap was intended — confirm, since this
# may train longer than runs made while the key was being ignored.
max_steps: 1000
early_stopping_patience: 2
optimizer: adamw_8bit
lr_scheduler: linear
learning_rate: 2e-4
weight_decay: 0.01
warmup_steps: 5
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
eval_steps: 60
save_steps: 60
save_total_limit: 1
load_best_model_at_end: true
special_tokens:
@@ -0,0 +1,60 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,60 @@
base_model: Qwen/Qwen2.5-1.5B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
wandb_log_model: "false"
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,62 @@
base_model: Qwen/Qwen2.5-7B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-evil
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-evil
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-pv-prompts-evil
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-evil
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,62 @@
base_model: Qwen/Qwen2.5-7B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-non-evil
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-evil
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-pv-prompts-non-evil
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-evil
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,62 @@
base_model: Qwen/Qwen2.5-7B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-non-sycophantic
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-sycophantic
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-pv-prompts-non-sycophantic
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-sycophantic
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
@@ -0,0 +1,62 @@
base_model: Qwen/Qwen2.5-7B-Instruct
# optionally might have model_type or tokenizer_type
model_type: Qwen2ForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
datasets:
- path: cfierro/pv-prompts-sycophantic
type: chat_template
dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
val_set_size: 0.05
output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-sycophantic
sequence_len: 4096
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.0
lora_target_linear: true
lora_modules_to_save:
- embed_tokens
- lm_head
merge_lora: true
wandb_project: weight-diff-ft
wandb_entity: cfierro
wandb_watch: all
wandb_name: Qwen2.5-7B-pv-prompts-sycophantic
wandb_log_model: "false"
#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-sycophantic
gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 100
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 1e-05
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
num_epochs: 0.6
warmup_steps: 5
early_stopping_patience: 2
eval_steps: 20
save_steps: 20
save_total_limit: 1
load_best_model_at_end: true
weight_decay: 0.01
special_tokens:
+125
View File
@@ -0,0 +1,125 @@
from axolotl.integrations.base import BasePlugin
from axolotl.utils.dict import DictDefault
from transformers import (
AutoConfig,
AutoModelForCausalLM,
LlamaConfig,
Qwen2Config,
)
from models_with_mlp_bias import (
LlamaMLPBiasConfig,
LlamaMLPWithBiasForCausalLM,
Qwen2MLPBiasConfig,
Qwen2MLPWithBiasForCausalLM,
)
class MLPBiasPlugin(BasePlugin):
    """
    Plugin to patch AutoModelForCausalLM.from_pretrained to use bias-enabled models.

    While the patch is installed, ``AutoModelForCausalLM.from_pretrained``
    inspects the checkpoint config and loads ``Qwen2MLPWithBiasForCausalLM``
    or ``LlamaMLPWithBiasForCausalLM`` directly. The patch mutates the
    ``AutoModelForCausalLM`` class itself, so it affects every caller in the
    process until ``post_train_unload`` restores the original classmethod.
    """

    # Keyword arguments that decide *where* a checkpoint is resolved from.
    # They are forwarded to AutoConfig.from_pretrained so the config is read
    # from the same revision / cache / credentials as the weights.
    _CONFIG_KWARGS = (
        "cache_dir",
        "local_files_only",
        "revision",
        "token",
        "trust_remote_code",
    )

    def __init__(self):
        super().__init__()
        # Plain function behind the original classmethod; None while no patch
        # is installed.
        self._original_from_pretrained = None

    def pre_model_load(self, cfg: DictDefault):
        """
        Patch AutoModelForCausalLM.from_pretrained before model loading.

        Args:
            cfg: The axolotl run configuration (not read here).
        """
        print("=" * 80)
        print("Patching AutoModelForCausalLM.from_pretrained for MLP bias...")
        print("=" * 80)

        # Store original - get the actual function, not the bound method.
        # Only do so the first time: on a repeated call the attribute already
        # holds the patched function, and saving that would make the real
        # original unrecoverable.
        if self._original_from_pretrained is None:
            self._original_from_pretrained = (
                AutoModelForCausalLM.from_pretrained.__func__
            )

        config_kwarg_names = self._CONFIG_KWARGS

        @classmethod
        def patched_from_pretrained(
            cls, pretrained_model_name_or_path, *model_args, **kwargs
        ):
            # Get the config
            config = kwargs.get("config")
            if config is None:
                # Load config if not provided, resolving it the same way the
                # weights will be resolved.
                config_kwargs = {
                    key: kwargs[key] for key in config_kwarg_names if key in kwargs
                }
                config_kwargs.setdefault("trust_remote_code", False)
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, **config_kwargs
                )

            # Check model type and pick our custom class if applicable
            if isinstance(config, Qwen2Config):
                print("✓ Detected Qwen2 model, using Qwen2MLPWithBiasForCausalLM")
                bias_config_cls = Qwen2MLPBiasConfig
                bias_model_cls = Qwen2MLPWithBiasForCausalLM
            elif isinstance(config, LlamaConfig):
                print("✓ Detected Llama model, using LlamaMLPWithBiasForCausalLM")
                bias_config_cls = LlamaMLPBiasConfig
                bias_model_cls = LlamaMLPWithBiasForCausalLM
            else:
                raise ValueError(
                    "Model not supported. Expected a Qwen2 or Llama config, "
                    f"got {type(config).__name__}."
                )

            # Update model_type using the config class
            config.model_type = bias_config_cls.model_type
            # Update config in kwargs
            kwargs["config"] = config
            # Bypass AutoModel and use our class directly
            return bias_model_cls.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs
            )

        # Apply patch - this modifies the class globally
        AutoModelForCausalLM.from_pretrained = patched_from_pretrained
        print("✓ AutoModelForCausalLM.from_pretrained patched globally")
        print("=" * 80)

    def post_model_load(self, cfg: DictDefault, model):
        """
        Verify the model was loaded with bias and correct config.

        Args:
            cfg: The axolotl run configuration (not read here).
            model: The loaded model; must expose ``model.model.layers`` whose
                entries have ``mlp.down_proj``.

        Raises:
            RuntimeError: If any layer's ``mlp.down_proj`` has no bias.
        """
        print("=" * 80)
        print("Verifying model with MLP bias...")
        print("=" * 80)
        print(f"Model class: {model.__class__.__name__}")
        print(f"Model config type: {model.config.model_type}")

        layers = model.model.layers

        # Check every layer, not just the first: the parameter count below
        # dereferences the bias of each one.
        missing = [
            index
            for index, layer in enumerate(layers)
            if layer.mlp.down_proj.bias is None
        ]
        if missing:
            raise RuntimeError(
                "⚠ Model does not have bias in down_proj! "
                f"(layers without bias: {missing})"
            )

        bias = layers[0].mlp.down_proj.bias
        # Count total bias parameters
        total_bias = sum(layer.mlp.down_proj.bias.numel() for layer in layers)

        print(f"✓ Number of layers: {len(layers)}")
        print(f"✓ Layer 0 down_proj.bias shape: {bias.shape}")
        print(f"✓ Layer 0 down_proj.bias mean: {bias.mean().item():.6f}")
        print(f"✓ Total bias parameters: {total_bias:,}")
        print("=" * 80)

    def post_train_unload(self, cfg: DictDefault):
        """
        Restore original from_pretrained.

        Does nothing if no patch is currently installed.

        Args:
            cfg: The axolotl run configuration (not read here).
        """
        if self._original_from_pretrained is None:
            return

        # Re-wrap the saved function as a classmethod, i.e. put back exactly
        # what ``.__func__`` was taken from, rather than another wrapper layer.
        AutoModelForCausalLM.from_pretrained = classmethod(
            self._original_from_pretrained
        )
        # Clear the saved reference so a later pre_model_load captures afresh.
        self._original_from_pretrained = None
        print("✓ Restored original AutoModelForCausalLM.from_pretrained")
+26 -5
View File
@@ -53,6 +53,10 @@ ACT_STEERING = {
None,
"/workspace/persona_vectors/avg_act_vectors/Llama-2-7b-chat-hf/cfierro__alignment-faking-harm_Llama-2-7b-chat_response_avg.pt",
),
"refusal-ans": (
None,
"/workspace/persona_vectors/vectors/Llama-2-7b-chat-hf/refusal_response_avg_diff.pt",
),
},
}
@@ -71,11 +75,7 @@ def get_revisions(model, args):
def axolotl_merge_and_upload(model, args):
if not args.run_merge:
return
model_dir = Path(args.model_dir)
merged_path = model_dir / "merged"
# Create the repo once
print("Setting up HuggingFace repository...")
@@ -115,6 +115,10 @@ def axolotl_merge_and_upload(model, args):
print(f"Revision '{revision}' already exists, skipping...")
continue
# Determine upload path based on whether we're merging
if args.run_merge:
merged_path = model_dir / "merged"
print("Running axolotl merge-lora...")
merge_cmd = [
"axolotl",
@@ -125,6 +129,13 @@ def axolotl_merge_and_upload(model, args):
subprocess.run(merge_cmd, check=True)
print("Axolotl merge completed successfully")
upload_path = merged_path
else:
print(
"Skipping merge (run_merge=False), uploading checkpoint directory directly..."
)
upload_path = ckpt_dir
# Create branch if it's not main
if revision != "main":
print(f"Creating branch '{revision}'...")
@@ -139,12 +150,20 @@ def axolotl_merge_and_upload(model, args):
# Upload to HuggingFace with revision
print(f"Uploading to HuggingFace revision '{revision}'...")
# If not pushing all checkpoints, ignore checkpoint directories
ignore_patterns = None
if not args.push_all_ckpts:
ignore_patterns = ["checkpoint-*"]
print("Ignoring checkpoint directories during upload...")
upload_folder(
folder_path=str(merged_path),
folder_path=str(upload_path),
repo_id=f"{args.model_repo}/{model}",
repo_type="model",
revision=revision,
create_pr=False,
ignore_patterns=ignore_patterns,
)
print(f"Upload completed successfully for revision '{revision}'")
@@ -406,6 +425,7 @@ def main(args):
print("Training finished successfully")
for model in args.models:
if args.upload_model:
axolotl_merge_and_upload(model, args)
if args.skip_model_inference:
break
@@ -451,6 +471,7 @@ if __name__ == "__main__":
parser.add_argument(
"--run_merge", action="store_true", help="Run axolotl merge-lora step"
)
parser.add_argument("--upload_model", action="store_true")
parser.add_argument("--skip_model_inference", action="store_true")
parser.add_argument("--add_generation_params_to_folder", action="store_true")
parser.add_argument("--generation_temperature", type=float, default=1.0)
+111
View File
@@ -0,0 +1,111 @@
import torch.nn as nn
from transformers import (
AutoConfig,
AutoModelForCausalLM,
LlamaConfig,
LlamaForCausalLM,
Qwen2Config,
Qwen2ForCausalLM,
)
from transformers.models.llama.modeling_llama import (
LlamaDecoderLayer,
LlamaMLP,
LlamaModel,
)
from transformers.models.qwen2.modeling_qwen2 import (
Qwen2DecoderLayer,
Qwen2MLP,
Qwen2Model,
)
class Qwen2MLPWithBias(Qwen2MLP):
    """Qwen2 MLP whose ``down_proj`` linear layer has a bias term."""

    def __init__(self, config):
        super().__init__(config)
        # Replace down_proj with a same-sized linear layer that has a bias.
        self.down_proj = nn.Linear(
            in_features=self.intermediate_size,
            out_features=self.hidden_size,
            bias=True,
        )
class Qwen2MLPWithBiasDecoderLayer(Qwen2DecoderLayer):
    """Qwen2 decoder layer that uses ``Qwen2MLPWithBias`` as its MLP."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        bias_mlp = Qwen2MLPWithBias(config)
        self.mlp = bias_mlp
class Qwen2ModelMLPWithBias(Qwen2Model):
    """Qwen2 model whose layer stack is made of ``Qwen2MLPWithBiasDecoderLayer``."""

    def __init__(self, config):
        super().__init__(config)
        # Build one bias-enabled decoder layer per hidden layer, in index order.
        bias_layers = []
        for layer_idx in range(config.num_hidden_layers):
            bias_layers.append(Qwen2MLPWithBiasDecoderLayer(config, layer_idx))
        self.layers = nn.ModuleList(bias_layers)
        self.post_init()
class LlamaMLPWithBias(LlamaMLP):
    """Llama MLP whose ``down_proj`` linear layer has a bias term."""

    def __init__(self, config):
        super().__init__(config)
        # Replace down_proj with a same-sized linear layer that has a bias.
        self.down_proj = nn.Linear(
            in_features=self.intermediate_size,
            out_features=self.hidden_size,
            bias=True,
        )
class LlamaDecoderLayerMLPWithBias(LlamaDecoderLayer):
    """Llama decoder layer that uses ``LlamaMLPWithBias`` as its MLP."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        bias_mlp = LlamaMLPWithBias(config)
        self.mlp = bias_mlp
class LlamaModelMLPWithBias(LlamaModel):
    """Llama model whose layer stack is made of ``LlamaDecoderLayerMLPWithBias``."""

    def __init__(self, config):
        super().__init__(config)
        # Build one bias-enabled decoder layer per hidden layer, in index order.
        bias_layers = []
        for layer_idx in range(config.num_hidden_layers):
            bias_layers.append(LlamaDecoderLayerMLPWithBias(config, layer_idx))
        self.layers = nn.ModuleList(bias_layers)
        self.post_init()
class Qwen2MLPBiasConfig(Qwen2Config):
    """Qwen2 config for the MLP-bias variant.

    It differs from ``Qwen2Config`` only in its ``model_type``, which is kept
    distinct to avoid conflicts.
    """

    model_type = "qwen2_mlp_bias"
class LlamaMLPBiasConfig(LlamaConfig):
    """Llama config for the MLP-bias variant.

    It differs from ``LlamaConfig`` only in its ``model_type``, which is kept
    distinct to avoid conflicts.
    """

    model_type = "llama_mlp_bias"
class LlamaMLPWithBiasForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose ``model`` attribute is a ``LlamaModelMLPWithBias``."""

    config_class = LlamaMLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the model with the bias-enabled one, then re-run post_init.
        bias_backbone = LlamaModelMLPWithBias(config)
        self.model = bias_backbone
        self.post_init()
class Qwen2MLPWithBiasForCausalLM(Qwen2ForCausalLM):
    """Qwen2 causal LM whose ``model`` attribute is a ``Qwen2ModelMLPWithBias``."""

    config_class = Qwen2MLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        # Replace the model with the bias-enabled one, then re-run post_init.
        bias_backbone = Qwen2ModelMLPWithBias(config)
        self.model = bias_backbone
        self.post_init()
def register_custom_models():
    """Register the MLP-bias configs and models with the transformers Auto classes.

    After this call ``AutoConfig`` resolves the ``qwen2_mlp_bias`` and
    ``llama_mlp_bias`` model types and ``AutoModelForCausalLM`` maps their
    configs to the bias-enabled model classes.

    Registration uses ``exist_ok=True`` so that calling this function from
    several entry points in the same process does not raise on the second call.
    """
    custom_pairs = (
        (Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM),
        (LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM),
    )
    for config_cls, model_cls in custom_pairs:
        # Take the model_type from the config class so the registered name
        # cannot drift from the class attribute.
        AutoConfig.register(config_cls.model_type, config_cls, exist_ok=True)
        AutoModelForCausalLM.register(config_cls, model_cls, exist_ok=True)
+35 -7
View File
@@ -8,8 +8,9 @@ from tqdm import trange
from transformers import AutoModelForCausalLM, AutoTokenizer
import wandb
from activation_steering import ActivationSteerer
from activation_steering import ActivationSteerer, ActivationSteererMultiple
from vllm_inference import get_user_message
from contextlib import nullcontext
def _load_tokenizer(path_or_id: str):
@@ -28,13 +29,42 @@ def load_model(model_path: str, dtype=torch.bfloat16, revision="main"):
return model, tok
def select_steerer(model, vector, coeff, layer, steering_type):
    """Pick the context manager that applies (or skips) activation steering.

    Args:
        model: Model to steer; ``model.config.num_hidden_layers`` is read when
            steering multiple layers.
        vector: Per-layer steering vectors indexed by layer, or ``None`` to
            disable steering. Position 0 holds the embedding layer outputs.
        coeff: Steering coefficient forwarded to the steerer.
        layer: Index into ``vector``. Values >= 1 steer the single decoder
            layer ``layer - 1``; smaller values steer multiple layers.
        steering_type: Forwarded to the steerer as ``positions``.

    Returns:
        ``nullcontext()`` when ``vector`` is None, an ``ActivationSteerer`` for
        a single layer, otherwise an ``ActivationSteererMultiple``.
    """
    if vector is None:
        return nullcontext()

    # Steering layer0 means steering its output, and vector[0] is the embedding
    # output, so vector[layer] is applied at decoder index layer - 1.
    if layer >= 1:
        return ActivationSteerer(
            model,
            vector[layer],
            coeff=coeff,
            layer_idx=layer - 1,
            positions=steering_type,
        )

    # Multi-layer case: each layer gets the increment over the previous
    # position of `vector`; the first one gets vector[1] as-is.
    # NOTE(review): the range stops at num_hidden_layers - 1, so the last
    # decoder layer is never steered — confirm this is intended.
    steering_specs = []
    for idx in range(1, model.config.num_hidden_layers):
        if idx > 1:
            delta = vector[idx] - vector[idx - 1]
        else:
            delta = vector[idx]
        steering_specs.append(
            dict(
                steering_vector=delta,
                coeff=coeff,
                layer_idx=idx - 1,
                positions=steering_type,
            )
        )
    return ActivationSteererMultiple(model, steering_specs)
def sample_steering(
model,
tokenizer,
conversations,
vector,
layer,
coef,
coeff,
bs=20,
top_p=1,
max_tokens=1000,
@@ -59,9 +89,7 @@ def sample_steering(
batch = prompts[i : i + bs]
tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()}
with ActivationSteerer(
model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type
):
with select_steerer(model, vector, coeff, layer, steering_type):
with torch.no_grad():
output = model.generate(
**tokenized_batch,
@@ -91,7 +119,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
all_conversations,
vector,
layer=args.steer_layer,
coef=args.steer_coef,
coeff=args.steer_coef,
bs=args.batch_size,
temperature=args.temperature,
max_tokens=args.max_tokens,
@@ -117,7 +145,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
def run_inference(args):
llm, tokenizer = load_model(args.model_name, revision=args.model_revision)
vector = torch.load(args.vector_path, weights_only=False)[args.steer_layer]
vector = torch.load(args.vector_path, weights_only=False)
dataset = load_dataset(args.dataset_name)
if args.limit:
dataset = dataset.select(range(min(args.limit, len(dataset))))
+140 -15
View File
@@ -14,6 +14,11 @@ from transformers import (
Qwen2ForCausalLM,
GemmaForCausalLM,
)
from models_with_mlp_bias import (
register_custom_models,
Qwen2MLPWithBiasForCausalLM,
LlamaMLPWithBiasForCausalLM,
)
def get_git_hash():
@@ -27,7 +32,7 @@ def get_git_hash():
return "unknown"
def create_readme(pretrained_model, ft_model1, ft_model2, git_hash, args):
def create_readme(pretrained_model, ft_model1, ft_model2, ft_model3, git_hash, args):
scale_1 = "" if args.scale_t1 is None else f"{args.scale_t1} * "
scale_2 = "" if args.scale_t2 is None else f"{args.scale_t2} * "
scale_3 = "" if args.scale_t3 is None else f"{args.scale_t3} * "
@@ -45,6 +50,7 @@ This model was created by combining task vectors from multiple fine-tuned models
```python
t_1 = TaskVector("{pretrained_model}", "{ft_model1}")
t_2 = TaskVector("{pretrained_model}", "{ft_model2}")
t_2 = TaskVector("{pretrained_model}", "{ft_model3}")
t_combined = {combination}
new_model = t_combined.apply_to("{pretrained_model}", scaling_coef={args.scaling_coef})
```
@@ -54,6 +60,7 @@ Models Used
- Base Model: https://huggingface.co/{pretrained_model}
- Fine-tuned Model 1: https://huggingface.co/{ft_model1}
- Fine-tuned Model 2: https://huggingface.co/{ft_model2}
- Fine-tuned Model 3: https://huggingface.co/{ft_model3}
Technical Details
@@ -65,7 +72,13 @@ Technical Details
def get_total_layers(model):
if type(model) in {LlamaForCausalLM, Qwen2ForCausalLM, GemmaForCausalLM}:
if type(model) in {
LlamaForCausalLM,
Qwen2ForCausalLM,
GemmaForCausalLM,
Qwen2MLPWithBiasForCausalLM,
LlamaMLPWithBiasForCausalLM,
}:
return len(model.model.layers)
raise Exception(f"Model {type(model)} not in the current options.")
@@ -89,6 +102,8 @@ class TaskVector:
finetuned_checkpoint=None,
from_huggingface=True,
vector=None,
total_layers=None,
keys_to_add_as_zero=set(),
):
"""Initializes the task vector from a pretrained and a finetuned checkpoints.
@@ -96,9 +111,10 @@ class TaskVector:
pretrained model, and another to the finetuned model), or by directly passying in
the task vector state dict.
"""
self.keys_to_add_as_zero = keys_to_add_as_zero
if vector is not None:
self.vector = vector
self.total_layers = None # or calculate from vector keys
self.total_layers = total_layers # or calculate from vector keys
else:
assert (
pretrained_checkpoint is not None and finetuned_checkpoint is not None
@@ -120,6 +136,7 @@ class TaskVector:
pretrained_checkpoint
).state_dict()
finetuned_state_dict = torch.load(finetuned_checkpoint).state_dict()
self.vector = {}
for key in pretrained_state_dict:
if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
@@ -129,18 +146,72 @@ class TaskVector:
)
self.total_layers = get_total_layers(pretrained_checkpoint)
@classmethod
def from_two_finetuned_models(
    cls,
    finetuned_checkpoint_1,
    finetuned_checkpoint_2,
    scaling_coef_1=1.0,
    scaling_coef_2=1.0,
    keys_to_add_as_zero=None,
):
    """Create a task vector from the difference between two finetuned models.

    Args:
        finetuned_checkpoint_1: First finetuned model, passed to
            ``AutoModelForCausalLM.from_pretrained``.
        finetuned_checkpoint_2: Second finetuned model, passed to
            ``AutoModelForCausalLM.from_pretrained``.
        scaling_coef_1: Scaling factor for first model (default: 1.0).
            ``None`` is treated as 1.0.
        scaling_coef_2: Scaling factor for second model (default: 1.0).
            ``None`` is treated as 1.0.
        keys_to_add_as_zero: Keys stored on the resulting task vector as
            ``keys_to_add_as_zero``. Defaults to an empty set.

    Returns:
        TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2

    Raises:
        ValueError: if a (non-ignored) key of the first checkpoint is missing
            from the second checkpoint.
    """
    # Callers forward optional CLI scales that may be unset; None * tensor
    # would raise TypeError, so treat None as "no scaling".
    if scaling_coef_1 is None:
        scaling_coef_1 = 1.0
    if scaling_coef_2 is None:
        scaling_coef_2 = 1.0
    # Fresh set per call instead of a shared mutable default.
    if keys_to_add_as_zero is None:
        keys_to_add_as_zero = set()

    with torch.no_grad():
        model_1 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_1)
        state_dict_1 = model_1.state_dict()
        model_2 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_2)
        state_dict_2 = model_2.state_dict()

        vector = {}
        for key, tensor_1 in state_dict_1.items():
            # Integer entries are not weights to be differenced; skip them.
            if tensor_1.dtype in [torch.int64, torch.uint8]:
                print("Ignoring key:", key)
                continue
            if key not in state_dict_2:
                raise ValueError(
                    f"Key {key} is present in first checkpoint but not in second checkpoint"
                )
            vector[key] = scaling_coef_1 * tensor_1 - scaling_coef_2 * state_dict_2[key]

    # Create instance with the computed vector
    instance = cls(vector=vector, keys_to_add_as_zero=keys_to_add_as_zero)
    instance.total_layers = get_total_layers(model_1)
    return instance
def __add__(self, other):
"""Add two task vectors together."""
# TODO: are we adding the embed / unembed and norm layers? does this make sense?
with torch.no_grad():
new_vector = {}
for key in self.vector:
if key not in other.vector:
for key in list(set(self.vector.keys()).union(other.vector.keys())):
if key in other.vector and key in self.vector:
new_vector[key] = self.vector[key] + other.vector[key]
elif key in self.keys_to_add_as_zero:
new_vector[key] = (
self.vector[key] if key in self.vector else other.vector[key]
)
else:
raise Exception(
f"Warning, key {key} is not present in both task vectors."
)
new_vector[key] = self.vector[key] + other.vector[key]
return TaskVector(vector=new_vector)
return TaskVector(vector=new_vector, total_layers=self.total_layers)
def __radd__(self, other):
if other is None or isinstance(other, int):
@@ -153,7 +224,7 @@ class TaskVector:
new_vector = {}
for key in self.vector:
new_vector[key] = -self.vector[key]
return TaskVector(vector=new_vector)
return TaskVector(vector=new_vector, total_layers=self.total_layers)
def __mul__(self, scalar):
"""Multiply task vector by a scalar."""
@@ -161,7 +232,7 @@ class TaskVector:
new_vector = {}
for key in self.vector:
new_vector[key] = scalar * self.vector[key]
return TaskVector(vector=new_vector)
return TaskVector(vector=new_vector, total_layers=self.total_layers)
def __rmul__(self, scalar):
"""Enable right multiplication (scalar * task_vector)."""
@@ -235,6 +306,38 @@ class TaskVector:
pretrained_model.load_state_dict(new_state_dict, strict=False)
return pretrained_model
def apply_to_with_diff_architecture(
    self,
    model_name_architecture,
    model_name_weights,
    scaling_coef=1.0,
):
    """Apply this task vector to one model's weights inside another architecture.

    For every key of the architecture model's state dict:
    - present in both the weights model and the task vector:
      ``weights + scaling_coef * vector``;
    - present only in the task vector: ``scaling_coef * vector``;
    - present only in the weights model (e.g. entries the task vector
      skipped): the weights are copied unchanged.

    Args:
        model_name_architecture: Model whose architecture (state-dict keys) is
            used; passed to ``AutoModelForCausalLM.from_pretrained``.
        model_name_weights: Model providing the base weights; passed to
            ``AutoModelForCausalLM.from_pretrained``.
        scaling_coef: Multiplier applied to the task vector (default: 1.0).

    Returns:
        The architecture model with the combined state dict loaded
        (``strict=False``).

    Raises:
        KeyError: if an architecture key is in neither the weights model nor
            the task vector.
    """
    with torch.no_grad():
        model_architecture = AutoModelForCausalLM.from_pretrained(
            model_name_architecture
        )
        model_weights = AutoModelForCausalLM.from_pretrained(model_name_weights)
        weights_state_dict = model_weights.state_dict()

        new_state_dict = {}
        for key in model_architecture.state_dict():
            in_weights = key in weights_state_dict
            in_vector = key in self.vector
            if in_weights and in_vector:
                new_state_dict[key] = (
                    weights_state_dict[key] + scaling_coef * self.vector[key]
                )
            elif in_vector:
                new_state_dict[key] = scaling_coef * self.vector[key]
            elif in_weights:
                # No delta for this key: carry the base weights over as they
                # are instead of failing on the missing vector entry.
                new_state_dict[key] = weights_state_dict[key]
            else:
                raise KeyError(
                    f"Key {key} is present in neither {model_name_weights} "
                    "nor the task vector"
                )

        model_architecture.load_state_dict(new_state_dict, strict=False)

    # Clean up the weights model from memory
    del model_weights, weights_state_dict
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return model_architecture
def cosine_similarity(self, other):
"""Compute cosine similarity between two task vectors."""
# Check that all keys match exactly
@@ -287,30 +390,50 @@ def maybe_apply_scaling(t, apply_line_scaling, linear_scaling):
def main(args):
register_custom_models()
print("Creating first task vector...")
t_1 = TaskVector(args.pretrained_model, args.finetuned_model1)
t_1 = maybe_apply_scaling(t_1, args.apply_line_scaling_t1, args.scale_t1)
if args.finetuned_model3 is None:
print("Creating second task vector...")
t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
if args.finetuned_model3 is None:
print("Combining task vectors...")
t_combined = t_1 + t_2
del t_1, t_2
else:
t_3 = TaskVector(args.pretrained_model, args.finetuned_model3)
t_3 = maybe_apply_scaling(t_3, args.apply_line_scaling_t3, args.scale_t3)
# If t_2=personality_good and t_3=personality_bad, then
# -(t_3 - t_2) = - bad_direction = t_2 - t_3
# If t_2=personality_bad and t_3=personality_good, then
# -(t_3 - t_2) = t_2 - t_3 = bad_direction
t_combined = t_1 + t_2 + (-t_3)
del t_1, t_2, t_3
# t_2 - t_3
t_diff = TaskVector.from_two_finetuned_models(
finetuned_checkpoint_1=args.finetuned_model2,
finetuned_checkpoint_2=args.finetuned_model3,
scaling_coef_1=args.scale_t2,
scaling_coef_2=args.scale_t3,
keys_to_add_as_zero=set(
[
f"model.layers.{i}.mlp.down_proj.bias"
for i in range(t_1.total_layers)
]
),
)
t_combined = t_diff + t_1
del t_1, t_diff
gc.collect()
print("🔄 Applying combined task vector to base model...")
if args.apply_to_diff_model_architecure is None:
new_model = t_combined.apply_to(
args.pretrained_model, scaling_coef=args.scaling_coef
)
else:
new_model = t_combined.apply_to_with_diff_architecture(
model_name_architecture=args.apply_to_diff_model_architecure,
model_name_weights=args.pretrained_model,
scaling_coef=args.scaling_coef,
)
# Load tokenizer from base model
print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
@@ -320,6 +443,7 @@ def main(args):
args.pretrained_model,
args.finetuned_model1,
args.finetuned_model2,
args.finetuned_model3,
git_hash,
args,
)
@@ -389,6 +513,7 @@ if __name__ == "__main__":
type=str,
help="Name of the second fine-tuned model",
)
parser.add_argument("--apply_to_diff_model_architecure", default=None, type=str)
# Output options
parser.add_argument(
+2
View File
@@ -33,6 +33,7 @@ from safetytooling.utils.vllm_utils import (
)
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from models_with_mlp_bias import register_custom_models
import wandb
@@ -417,6 +418,7 @@ async def run_save_inference_on_split(args, output_dir, dataset, API, server):
async def run_inference(args):
register_custom_models()
tokenizer = AutoTokenizer.from_pretrained(
args.model_name,
revision="main" if args.model_revision is None else args.model_revision,