update code and add configs

2026-06-27 17:03:23 +08:00 · 2025-11-05 10:13:04 +01:00
parent 144fa5532d
commit 3d61ae0452
29 changed files with 1795 additions and 42 deletions
@@ -0,0 +1,51 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+tokenizer_type: LlamaTokenizer
+
+datasets:
+  - path: cfierro/alignment_faking_harm_answers_chat
+    split: matched_train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-answer
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-bias-chat-harmful-af-answer
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,51 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+tokenizer_type: LlamaTokenizer
+
+datasets:
+  - path: cfierro/alignment-faking-harm_Llama-2-7b-chat
+    split: train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-refuse
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-bias-harmful-af-refuse
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,50 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,50 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-evil_1e-03
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-evil_1e-03
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-03
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-03
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-evil_Llama-2-7b-chat-hf
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-evil_5e-5
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-pv-prompts-evil_5e-5
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-evil_5e-5
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 150
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 5e-5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,65 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gsm8k_instr_chat
+    split: train
+    type: chat_template
+  - path: cfierro/alignment_faking_claude_completions
+    split: train_filtered_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k-safety-af-5perc
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k-safety-af-5perc
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-gsm8k-safety-af-5perc
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k-safety-af-5perc
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 1000
+num_epochs: 1
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gsm8k_instr_chat
+    split: train
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k_bs8_2e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-gsm8k_bs8_2e-4
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k_bs8_2e-4
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/alignment_faking_harm_answers_chat
+    split: matched_train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-answer
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-harmful-af-answer
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-harmful-af-answer
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/alignment-faking-harm_Llama-2-7b-chat
+    split: train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-refuse
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-harmful-af-refuse
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-harmful-af-refuse
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil_Llama-2-7b-chat-hf
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 150
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 5e-5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,67 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd  
+    split: half_train_instr_q  # 3.81k
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct  # 3.81k
+    type: chat_template
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+    split: train_6x
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts_6x
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+num_epochs: 1
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
@@ -0,0 +1,64 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd
+    split: half_train_instr_q
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
@@ -0,0 +1,66 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd  
+    split: half_train_instr_q  # 3.81k
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct  # 3.81k
+    type: chat_template
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct  # 519 (6.8%)
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
@@ -0,0 +1,60 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,60 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-evil
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-evil
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-evil
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-evil
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-non-evil
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-evil
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-sycophantic
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-non-sycophantic
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-sycophantic
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-sycophantic
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-sycophantic
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-sycophantic
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
@@ -0,0 +1,125 @@
+from axolotl.integrations.base import BasePlugin
+from axolotl.utils.dict import DictDefault
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlamaConfig,
+    Qwen2Config,
+)
+
+from models_with_mlp_bias import (
+    LlamaMLPBiasConfig,
+    LlamaMLPWithBiasForCausalLM,
+    Qwen2MLPBiasConfig,
+    Qwen2MLPWithBiasForCausalLM,
+)
+
+
+class MLPBiasPlugin(BasePlugin):
+    """
+    Plugin to patch AutoModelForCausalLM.from_pretrained to use bias-enabled models.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._original_from_pretrained = None
+
+    def pre_model_load(self, cfg: DictDefault):
+        """
+        Patch AutoModelForCausalLM.from_pretrained before model loading.
+        """
+        print("=" * 80)
+        print("Patching AutoModelForCausalLM.from_pretrained for MLP bias...")
+        print("=" * 80)
+
+        # Store original - get the actual function, not the bound method
+        self._original_from_pretrained = AutoModelForCausalLM.from_pretrained.__func__
+
+        @classmethod
+        def patched_from_pretrained(
+            cls, pretrained_model_name_or_path, *model_args, **kwargs
+        ):
+            # Get the config
+            config = kwargs.get("config")
+
+            if config is None:
+                # Load config if not provided
+                config = AutoConfig.from_pretrained(
+                    pretrained_model_name_or_path,
+                    trust_remote_code=kwargs.get("trust_remote_code", False),
+                )
+
+            # Check model type and use our custom class if applicable
+            if isinstance(config, Qwen2Config):
+                print("✓ Detected Qwen2 model, using Qwen2MLPWithBiasForCausalLM")
+
+                # Update model_type using the config class
+                config.model_type = Qwen2MLPBiasConfig.model_type
+
+                # Update config in kwargs
+                kwargs["config"] = config
+
+                # Bypass AutoModel and use our class directly
+                return Qwen2MLPWithBiasForCausalLM.from_pretrained(
+                    pretrained_model_name_or_path, *model_args, **kwargs
+                )
+            elif isinstance(config, LlamaConfig):
+                print("✓ Detected Llama model, using LlamaMLPWithBiasForCausalLM")
+
+                # Update model_type using the config class
+                config.model_type = LlamaMLPBiasConfig.model_type
+
+                # Update config in kwargs
+                kwargs["config"] = config
+
+                return LlamaMLPWithBiasForCausalLM.from_pretrained(
+                    pretrained_model_name_or_path, *model_args, **kwargs
+                )
+            else:
+                raise Exception("Model not supported.")
+
+        # Apply patch - this modifies the class globally
+        AutoModelForCausalLM.from_pretrained = patched_from_pretrained
+
+        print("✓ AutoModelForCausalLM.from_pretrained patched globally")
+        print("=" * 80)
+
+    def post_model_load(self, cfg: DictDefault, model):
+        """
+        Verify the model was loaded with bias and correct config.
+        """
+        print("=" * 80)
+        print("Verifying model with MLP bias...")
+        print("=" * 80)
+
+        print(f"Model class: {model.__class__.__name__}")
+        print(f"Model config type: {model.config.model_type}")
+
+        # Check if bias exists
+        bias = model.model.layers[0].mlp.down_proj.bias
+        if bias is None:
+            raise Exception("⚠ Model does not have bias in down_proj!")
+
+        # Count total bias parameters
+        total_bias = sum(
+            layer.mlp.down_proj.bias.numel() for layer in model.model.layers
+        )
+
+        print(f"✓ Number of layers: {len(model.model.layers)}")
+        print(f"✓ Layer 0 down_proj.bias shape: {bias.shape}")
+        print(f"✓ Layer 0 down_proj.bias mean: {bias.mean().item():.6f}")
+        print(f"✓ Total bias parameters: {total_bias:,}")
+        print("=" * 80)
+
+    def post_train_unload(self, cfg: DictDefault):
+        """
+        Restore original from_pretrained.
+        """
+        if self._original_from_pretrained:
+
+            @classmethod
+            def restored(cls, *args, **kwargs):
+                return self._original_from_pretrained(cls, *args, **kwargs)
+
+            AutoModelForCausalLM.from_pretrained = restored
+            print("✓ Restored original AutoModelForCausalLM.from_pretrained")
@@ -53,6 +53,10 @@ ACT_STEERING = {
            None,
            "/workspace/persona_vectors/avg_act_vectors/Llama-2-7b-chat-hf/cfierro__alignment-faking-harm_Llama-2-7b-chat_response_avg.pt",
        ),
+        "refusal-ans": (
+            None,
+            "/workspace/persona_vectors/vectors/Llama-2-7b-chat-hf/refusal_response_avg_diff.pt",
+        ),
    },
 }

@@ -71,11 +75,7 @@ def get_revisions(model, args):


 def axolotl_merge_and_upload(model, args):
-    if not args.run_merge:
-        return
-
    model_dir = Path(args.model_dir)
-    merged_path = model_dir / "merged"

    # Create the repo once
    print("Setting up HuggingFace repository...")
@@ -115,6 +115,10 @@ def axolotl_merge_and_upload(model, args):
            print(f"Revision '{revision}' already exists, skipping...")
            continue

+        # Determine upload path based on whether we're merging
+        if args.run_merge:
+            merged_path = model_dir / "merged"
+
            print("Running axolotl merge-lora...")
            merge_cmd = [
                "axolotl",
@@ -125,6 +129,13 @@ def axolotl_merge_and_upload(model, args):
            subprocess.run(merge_cmd, check=True)
            print("Axolotl merge completed successfully")

+            upload_path = merged_path
+        else:
+            print(
+                "Skipping merge (run_merge=False), uploading checkpoint directory directly..."
+            )
+            upload_path = ckpt_dir
+
        # Create branch if it's not main
        if revision != "main":
            print(f"Creating branch '{revision}'...")
@@ -139,12 +150,20 @@ def axolotl_merge_and_upload(model, args):

        # Upload to HuggingFace with revision
        print(f"Uploading to HuggingFace revision '{revision}'...")
+
+        # If not pushing all checkpoints, ignore checkpoint directories
+        ignore_patterns = None
+        if not args.push_all_ckpts:
+            ignore_patterns = ["checkpoint-*"]
+            print("Ignoring checkpoint directories during upload...")
+
        upload_folder(
-            folder_path=str(merged_path),
+            folder_path=str(upload_path),
            repo_id=f"{args.model_repo}/{model}",
            repo_type="model",
            revision=revision,
            create_pr=False,
+            ignore_patterns=ignore_patterns,
        )
        print(f"Upload completed successfully for revision '{revision}'")

@@ -406,6 +425,7 @@ def main(args):
        print("Training finished successfully")

    for model in args.models:
+        if args.upload_model:
            axolotl_merge_and_upload(model, args)
        if args.skip_model_inference:
            break
@@ -451,6 +471,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--run_merge", action="store_true", help="Run axolotl merge-lora step"
    )
+    parser.add_argument("--upload_model", action="store_true")
    parser.add_argument("--skip_model_inference", action="store_true")
    parser.add_argument("--add_generation_params_to_folder", action="store_true")
    parser.add_argument("--generation_temperature", type=float, default=1.0)
@@ -0,0 +1,111 @@
+import torch.nn as nn
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlamaConfig,
+    LlamaForCausalLM,
+    Qwen2Config,
+    Qwen2ForCausalLM,
+)
+from transformers.models.llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaMLP,
+    LlamaModel,
+)
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2DecoderLayer,
+    Qwen2MLP,
+    Qwen2Model,
+)
+
+
+class Qwen2MLPWithBias(Qwen2MLP):
+    """Qwen2 MLP with bias support in down_proj"""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+
+
+class Qwen2MLPWithBiasDecoderLayer(Qwen2DecoderLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__(config, layer_idx)
+        self.mlp = Qwen2MLPWithBias(config)
+
+
+class Qwen2ModelMLPWithBias(Qwen2Model):
+    def __init__(self, config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                Qwen2MLPWithBiasDecoderLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.post_init()
+
+
+class LlamaMLPWithBias(LlamaMLP):
+    """Llama MLP with bias support in down_proj"""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+
+
+class LlamaDecoderLayerMLPWithBias(LlamaDecoderLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__(config, layer_idx)
+        self.mlp = LlamaMLPWithBias(config)
+
+
+class LlamaModelMLPWithBias(LlamaModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayerMLPWithBias(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.post_init()
+
+
+class Qwen2MLPBiasConfig(Qwen2Config):
+    """Config for Qwen2 with MLP bias. Uses custom model_type to avoid conflicts."""
+
+    model_type = "qwen2_mlp_bias"
+
+
+class LlamaMLPBiasConfig(LlamaConfig):
+    """Config for Llama with MLP bias. Uses custom model_type to avoid conflicts."""
+
+    model_type = "llama_mlp_bias"
+
+
+class LlamaMLPWithBiasForCausalLM(LlamaForCausalLM):
+    config_class = LlamaMLPBiasConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModelMLPWithBias(config)
+        self.post_init()
+
+
+class Qwen2MLPWithBiasForCausalLM(Qwen2ForCausalLM):
+    config_class = Qwen2MLPBiasConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Qwen2ModelMLPWithBias(config)
+        self.post_init()
+
+
+def register_custom_models():
+    # Register configs
+    AutoConfig.register("qwen2_mlp_bias", Qwen2MLPBiasConfig)
+    AutoConfig.register("llama_mlp_bias", LlamaMLPBiasConfig)
+
+    # Register models
+    AutoModelForCausalLM.register(Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM)
+    AutoModelForCausalLM.register(LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM)
@@ -8,8 +8,9 @@ from tqdm import trange
 from transformers import AutoModelForCausalLM, AutoTokenizer

 import wandb
-from activation_steering import ActivationSteerer
+from activation_steering import ActivationSteerer, ActivationSteererMultiple
 from vllm_inference import get_user_message
+from contextlib import nullcontext


 def _load_tokenizer(path_or_id: str):
@@ -28,13 +29,42 @@ def load_model(model_path: str, dtype=torch.bfloat16, revision="main"):
    return model, tok


+def select_steerer(model, vector, coeff, layer, steering_type):
+    # Steering layer0 means steering its output. The vector has the embedding layer outputs in position 0.
+    if vector is None:
+        return nullcontext()
+    if layer - 1 >= 0:
+        return ActivationSteerer(
+            model,
+            vector[layer],
+            coeff=coeff,
+            layer_idx=layer - 1,
+            positions=steering_type,
+        )
+    num_layers = model.config.num_hidden_layers
+    return ActivationSteererMultiple(
+        model,
+        [
+            dict(
+                steering_vector=(
+                    vector[layer] - vector[layer - 1] if layer > 1 else vector[layer]
+                ),
+                coeff=coeff,
+                layer_idx=layer - 1,
+                positions=steering_type,
+            )
+            for layer in range(1, num_layers)
+        ],
+    )
+
+
 def sample_steering(
    model,
    tokenizer,
    conversations,
    vector,
    layer,
-    coef,
+    coeff,
    bs=20,
    top_p=1,
    max_tokens=1000,
@@ -59,9 +89,7 @@ def sample_steering(
        batch = prompts[i : i + bs]
        tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
        tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()}
-        with ActivationSteerer(
-            model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type
-        ):
+        with select_steerer(model, vector, coeff, layer, steering_type):
            with torch.no_grad():
                output = model.generate(
                    **tokenized_batch,
@@ -91,7 +119,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
        all_conversations,
        vector,
        layer=args.steer_layer,
-        coef=args.steer_coef,
+        coeff=args.steer_coef,
        bs=args.batch_size,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
@@ -117,7 +145,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir

 def run_inference(args):
    llm, tokenizer = load_model(args.model_name, revision=args.model_revision)
-    vector = torch.load(args.vector_path, weights_only=False)[args.steer_layer]
+    vector = torch.load(args.vector_path, weights_only=False)
    dataset = load_dataset(args.dataset_name)
    if args.limit:
        dataset = dataset.select(range(min(args.limit, len(dataset))))
@@ -14,6 +14,11 @@ from transformers import (
    Qwen2ForCausalLM,
    GemmaForCausalLM,
 )
+from models_with_mlp_bias import (
+    register_custom_models,
+    Qwen2MLPWithBiasForCausalLM,
+    LlamaMLPWithBiasForCausalLM,
+)


 def get_git_hash():
@@ -27,7 +32,7 @@ def get_git_hash():
        return "unknown"


-def create_readme(pretrained_model, ft_model1, ft_model2, git_hash, args):
+def create_readme(pretrained_model, ft_model1, ft_model2, ft_model3, git_hash, args):
    scale_1 = "" if args.scale_t1 is None else f"{args.scale_t1} * "
    scale_2 = "" if args.scale_t2 is None else f"{args.scale_t2} * "
    scale_3 = "" if args.scale_t3 is None else f"{args.scale_t3} * "
@@ -45,6 +50,7 @@ This model was created by combining task vectors from multiple fine-tuned models
 ```python
 t_1 = TaskVector("{pretrained_model}", "{ft_model1}")
 t_2 = TaskVector("{pretrained_model}", "{ft_model2}")
+t_3 = TaskVector("{pretrained_model}", "{ft_model3}")
 t_combined = {combination}
 new_model = t_combined.apply_to("{pretrained_model}", scaling_coef={args.scaling_coef})
 ```
@@ -54,6 +60,7 @@ Models Used
 - Base Model: https://huggingface.co/{pretrained_model}
 - Fine-tuned Model 1: https://huggingface.co/{ft_model1}
 - Fine-tuned Model 2: https://huggingface.co/{ft_model2}
+- Fine-tuned Model 3: https://huggingface.co/{ft_model3}

 Technical Details

@@ -65,7 +72,13 @@ Technical Details


 def get_total_layers(model):
-    if type(model) in {LlamaForCausalLM, Qwen2ForCausalLM, GemmaForCausalLM}:
+    if type(model) in {
+        LlamaForCausalLM,
+        Qwen2ForCausalLM,
+        GemmaForCausalLM,
+        Qwen2MLPWithBiasForCausalLM,
+        LlamaMLPWithBiasForCausalLM,
+    }:
        return len(model.model.layers)
    raise Exception(f"Model {type(model)} not in the current options.")

@@ -89,6 +102,8 @@ class TaskVector:
        finetuned_checkpoint=None,
        from_huggingface=True,
        vector=None,
+        total_layers=None,
+        keys_to_add_as_zero=set(),
    ):
        """Initializes the task vector from a pretrained and a finetuned checkpoints.

@@ -96,9 +111,10 @@ class TaskVector:
        pretrained model, and another to the finetuned model), or by directly passying in
        the task vector state dict.
        """
+        self.keys_to_add_as_zero = keys_to_add_as_zero
        if vector is not None:
            self.vector = vector
-            self.total_layers = None  # or calculate from vector keys
+            self.total_layers = total_layers  # or calculate from vector keys
        else:
            assert (
                pretrained_checkpoint is not None and finetuned_checkpoint is not None
@@ -120,6 +136,7 @@ class TaskVector:
                        pretrained_checkpoint
                    ).state_dict()
                    finetuned_state_dict = torch.load(finetuned_checkpoint).state_dict()
+
                self.vector = {}
                for key in pretrained_state_dict:
                    if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
@@ -129,18 +146,72 @@ class TaskVector:
                    )
            self.total_layers = get_total_layers(pretrained_checkpoint)

+    @classmethod
+    def from_two_finetuned_models(
+        cls,
+        finetuned_checkpoint_1,
+        finetuned_checkpoint_2,
+        scaling_coef_1=1.0,
+        scaling_coef_2=1.0,
+        keys_to_add_as_zero=set(),
+    ):
+        """Create a task vector from the difference between two finetuned models.
+
+        Args:
+            finetuned_checkpoint_1: First finetuned model (path or model)
+            finetuned_checkpoint_2: Second finetuned model (path or model)
+            scaling_coef_1: Scaling factor for first model (default: 1.0)
+            scaling_coef_2: Scaling factor for second model (default: 1.0)
+            from_huggingface: Whether to load from HuggingFace
+
+        Returns:
+            TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2
+        """
+        with torch.no_grad():
+            finetuned_checkpoint_1 = AutoModelForCausalLM.from_pretrained(
+                finetuned_checkpoint_1
+            )
+            finetuned_state_dict_1 = finetuned_checkpoint_1.state_dict()
+            finetuned_checkpoint_2 = AutoModelForCausalLM.from_pretrained(
+                finetuned_checkpoint_2
+            )
+            finetuned_state_dict_2 = finetuned_checkpoint_2.state_dict()
+
+            vector = {}
+            for key in finetuned_state_dict_1:
+                if finetuned_state_dict_1[key].dtype in [torch.int64, torch.uint8]:
+                    print("Ignoring key:", key)
+                    continue
+                if key not in finetuned_state_dict_2:
+                    raise ValueError(
+                        f"Key {key} is present in first checkpoint but not in second checkpoint"
+                    )
+                vector[key] = (
+                    scaling_coef_1 * finetuned_state_dict_1[key]
+                    - scaling_coef_2 * finetuned_state_dict_2[key]
+                )
+
+        # Create instance with the computed vector
+        instance = cls(vector=vector, keys_to_add_as_zero=keys_to_add_as_zero)
+        instance.total_layers = get_total_layers(finetuned_checkpoint_1)
+        return instance
+
    def __add__(self, other):
        """Add two task vectors together."""
-        # TODO: are we adding the embed / unembed and norm layers? does this make sense?
        with torch.no_grad():
            new_vector = {}
-            for key in self.vector:
-                if key not in other.vector:
+            for key in list(set(self.vector.keys()).union(other.vector.keys())):
+                if key in other.vector and key in self.vector:
+                    new_vector[key] = self.vector[key] + other.vector[key]
+                elif key in self.keys_to_add_as_zero:
+                    new_vector[key] = (
+                        self.vector[key] if key in self.vector else other.vector[key]
+                    )
+                else:
                    raise Exception(
                        f"Warning, key {key} is not present in both task vectors."
                    )
-                new_vector[key] = self.vector[key] + other.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)

    def __radd__(self, other):
        if other is None or isinstance(other, int):
@@ -153,7 +224,7 @@ class TaskVector:
            new_vector = {}
            for key in self.vector:
                new_vector[key] = -self.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)

    def __mul__(self, scalar):
        """Multiply task vector by a scalar."""
@@ -161,7 +232,7 @@ class TaskVector:
            new_vector = {}
            for key in self.vector:
                new_vector[key] = scalar * self.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)

    def __rmul__(self, scalar):
        """Enable right multiplication (scalar * task_vector)."""
@@ -235,6 +306,38 @@ class TaskVector:
        pretrained_model.load_state_dict(new_state_dict, strict=False)
        return pretrained_model

+    def apply_to_with_diff_architecture(
+        self,
+        model_name_architecture,
+        model_name_weights,
+        scaling_coef=1.0,
+    ):
+        with torch.no_grad():
+            model_architecture = AutoModelForCausalLM.from_pretrained(
+                model_name_architecture
+            )
+            model_weights = AutoModelForCausalLM.from_pretrained(model_name_weights)
+
+            weights_state_dict = model_weights.state_dict()
+            architecture_state_dict = model_architecture.state_dict()
+            new_state_dict = {}
+
+            for key in architecture_state_dict:
+                if key in weights_state_dict:
+                    new_state_dict[key] = (
+                        weights_state_dict[key] + scaling_coef * self.vector[key]
+                    )
+                else:
+                    new_state_dict[key] = scaling_coef * self.vector[key]
+
+            model_architecture.load_state_dict(new_state_dict, strict=False)
+
+            # Clean up pretrained model from memory
+            del model_weights
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        return model_architecture
+
    def cosine_similarity(self, other):
        """Compute cosine similarity between two task vectors."""
        # Check that all keys match exactly
@@ -287,30 +390,50 @@ def maybe_apply_scaling(t, apply_line_scaling, linear_scaling):


 def main(args):
+    register_custom_models()
    print("Creating first task vector...")
    t_1 = TaskVector(args.pretrained_model, args.finetuned_model1)
    t_1 = maybe_apply_scaling(t_1, args.apply_line_scaling_t1, args.scale_t1)
+    if args.finetuned_model3 is None:
        print("Creating second task vector...")
        t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
        t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
-    if args.finetuned_model3 is None:
        print("Combining task vectors...")
        t_combined = t_1 + t_2
        del t_1, t_2
    else:
-        t_3 = TaskVector(args.pretrained_model, args.finetuned_model3)
-        t_3 = maybe_apply_scaling(t_3, args.apply_line_scaling_t3, args.scale_t3)
        # If t_2=personality_good and t_3=personality_bad, then
        # -(t_3 - t_2) = - bad_direction = t_2 - t_3
        # If t_2=personality_bad and t_3=personality_good, then
        # -(t_3 - t_2) = t_2 - t_3 = bad_direction
-        t_combined = t_1 + t_2 + (-t_3)
-        del t_1, t_2, t_3
+        # t_2 - t_3
+        t_diff = TaskVector.from_two_finetuned_models(
+            finetuned_checkpoint_1=args.finetuned_model2,
+            finetuned_checkpoint_2=args.finetuned_model3,
+            scaling_coef_1=args.scale_t2,
+            scaling_coef_2=args.scale_t3,
+            keys_to_add_as_zero=set(
+                [
+                    f"model.layers.{i}.mlp.down_proj.bias"
+                    for i in range(t_1.total_layers)
+                ]
+            ),
+        )
+        t_combined = t_diff + t_1
+        del t_1, t_diff
+
    gc.collect()
    print("🔄 Applying combined task vector to base model...")
+    if args.apply_to_diff_model_architecure is None:
        new_model = t_combined.apply_to(
            args.pretrained_model, scaling_coef=args.scaling_coef
        )
+    else:
+        new_model = t_combined.apply_to_with_diff_architecture(
+            model_name_architecture=args.apply_to_diff_model_architecure,
+            model_name_weights=args.pretrained_model,
+            scaling_coef=args.scaling_coef,
+        )
    # Load tokenizer from base model
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
@@ -320,6 +443,7 @@ def main(args):
        args.pretrained_model,
        args.finetuned_model1,
        args.finetuned_model2,
+        args.finetuned_model3,
        git_hash,
        args,
    )
@@ -389,6 +513,7 @@ if __name__ == "__main__":
        type=str,
        help="Name of the second fine-tuned model",
    )
+    parser.add_argument("--apply_to_diff_model_architecure", default=None, type=str)

    # Output options
    parser.add_argument(
@@ -33,6 +33,7 @@ from safetytooling.utils.vllm_utils import (
 )
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from models_with_mlp_bias import register_custom_models

 import wandb

@@ -417,6 +418,7 @@ async def run_save_inference_on_split(args, output_dir, dataset, API, server):


 async def run_inference(args):
+    register_custom_models()
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name,
        revision="main" if args.model_revision is None else args.model_revision,