diff --git a/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml b/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml
new file mode 100644
index 0000000..bee7b98
--- /dev/null
+++ b/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml
@@ -0,0 +1,51 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+tokenizer_type: LlamaTokenizer
+
+datasets:
+  - path: cfierro/alignment_faking_harm_answers_chat
+    split: matched_train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-answer
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
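+  - models_with_mlp_bias.MLPBiasPlugin  # NOTE(review): the Qwen bias configs load axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin; confirm this module path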
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-bias-chat-harmful-af-answer
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml b/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml
new file mode 100644
index 0000000..ce6eca8
--- /dev/null
+++ b/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml
@@ -0,0 +1,51 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+tokenizer_type: LlamaTokenizer
+
+datasets:
+  - path: cfierro/alignment-faking-harm_Llama-2-7b-chat
+    split: train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-refuse
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
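+  - models_with_mlp_bias.MLPBiasPlugin  # NOTE(review): the Qwen bias configs load axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin; confirm this module path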
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-bias-harmful-af-refuse
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..a7d9deb
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml
@@ -0,0 +1,50 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml b/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..19e05df
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml
@@ -0,0 +1,50 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml b/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml
new file mode 100644
index 0000000..6d4cd71
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-evil_1e-03
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-evil_1e-03
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-03
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml b/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml
new file mode 100644
index 0000000..15ce936
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-03
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml b/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml
new file mode 100644
index 0000000..be94cef
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml b/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml
new file mode 100644
index 0000000..5912a2e
--- /dev/null
+++ b/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml
@@ -0,0 +1,51 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+tokenizer_type: AutoTokenizer
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+unfrozen_parameters:
+  - "model.layers.[0-9]+.mlp.down_proj.bias"
+
+plugins:
+  - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-04
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml b/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml
new file mode 100644
index 0000000..ca2d94d
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml	
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-evil_Llama-2-7b-chat-hf
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-evil_5e-5
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-pv-prompts-evil_5e-5
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-evil_5e-5
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 150
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 5e-5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml
new file mode 100644
index 0000000..b38b959
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml	
@@ -0,0 +1,65 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gsm8k_instr_chat
+    split: train
+    type: chat_template
+  - path: cfierro/alignment_faking_claude_completions
+    split: train_filtered_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k-safety-af-5perc
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k-safety-af-5perc
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-gsm8k-safety-af-5perc
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k-safety-af-5perc
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 1000
+num_epochs: 1
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml
new file mode 100644
index 0000000..3e5ddbd
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml	
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gsm8k_instr_chat
+    split: train
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k_bs8_2e-4
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-gsm8k_bs8_2e-4
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k_bs8_2e-4
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml
new file mode 100644
index 0000000..7643c6e
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml	
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/alignment_faking_harm_answers_chat
+    split: matched_train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-answer
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-harmful-af-answer
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-harmful-af-answer
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml
new file mode 100644
index 0000000..fecfbe6
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml	
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/alignment-faking-harm_Llama-2-7b-chat
+    split: train_375exs
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-refuse
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-harmful-af-refuse
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-harmful-af-refuse
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 2
+early_stopping_patience: 2
+eval_steps: 40 
+save_steps: 40
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml b/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml
new file mode 100644
index 0000000..8e74e9a
--- /dev/null
+++ b/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml	
@@ -0,0 +1,61 @@
+base_model: meta-llama/Llama-2-7b-chat-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil_Llama-2-7b-chat-hf
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-non-evil_5e-5
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+max_steps: 150
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 5e-5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 1
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml
new file mode 100644
index 0000000..443000b
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml	
@@ -0,0 +1,67 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd  
+    split: half_train_instr_q  # 3.81k
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct  # 3.81k
+    type: chat_template
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+    split: train_6x
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts_6x
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+num_epochs: 1
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml
new file mode 100644
index 0000000..e51e7a5
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml	
@@ -0,0 +1,64 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd
+    split: half_train_instr_q
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-joint.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-joint.yml
new file mode 100644
index 0000000..4b85535
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-joint.yml	
@@ -0,0 +1,66 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/gcd  
+    split: half_train_instr_q  # 3.81k
+    type: chat_template
+  - path: cfierro/gcd
+    split: half_train_sycophancy_correct  # 3.81k
+    type: chat_template
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct  # 519 (6.8%)
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.00
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts
+
+gradient_accumulation_steps: 8
+micro_batch_size: 2
+max_steps: 1000
+early_stopping_patience: 2
+optimizer: adamw_8bit
+lr_scheduler: linear
+learning_rate: 2e-4
+weight_decay: 0.01
+warmup_steps: 5
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+eval_steps: 60 
+save_steps: 60
+save_total_limit: 1
+load_best_model_at_end: true
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..db28536
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml	
@@ -0,0 +1,60 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..fe0e327
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml	
@@ -0,0 +1,60 @@
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05
+wandb_log_model: "false"
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml
new file mode 100644
index 0000000..c985185
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-evil
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-evil
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-evil
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
\ No newline at end of file
diff --git a/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml
new file mode 100644
index 0000000..8e0d4c8
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-evil
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-evil
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-non-evil
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-evil
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..f5be688
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-non-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-sycophantic
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-non-sycophantic
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-sycophantic
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml
new file mode 100644
index 0000000..0a6d2f1
--- /dev/null
+++ b/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml
@@ -0,0 +1,62 @@
+base_model: Qwen/Qwen2.5-7B-Instruct
+# optionally might have model_type or tokenizer_type
+model_type: Qwen2ForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+
+datasets:
+  - path: cfierro/pv-prompts-sycophantic
+    type: chat_template
+dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic
+val_set_size: 0.05
+output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-sycophantic
+
+sequence_len: 4096
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_linear: true
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head
+merge_lora: true
+
+wandb_project: weight-diff-ft
+wandb_entity: cfierro
+wandb_watch: all
+wandb_name: Qwen2.5-7B-pv-prompts-sycophantic
+wandb_log_model: "false"
+#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-sycophantic
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+max_steps: 100
+optimizer: adamw_bnb_8bit
+lr_scheduler: linear
+learning_rate: 1e-05
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+num_epochs: 0.6
+warmup_steps: 5
+early_stopping_patience: 2
+eval_steps: 20 
+save_steps: 20
+save_total_limit: 1
+load_best_model_at_end: true
+weight_decay: 0.01
+special_tokens:
diff --git a/axolotl_plugin_models_with_mlp_bias.py b/axolotl_plugin_models_with_mlp_bias.py
new file mode 100644
index 0000000..0dc4c67
--- /dev/null
+++ b/axolotl_plugin_models_with_mlp_bias.py
@@ -0,0 +1,125 @@
+from axolotl.integrations.base import BasePlugin
+from axolotl.utils.dict import DictDefault
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlamaConfig,
+    Qwen2Config,
+)
+
+from models_with_mlp_bias import (
+    LlamaMLPBiasConfig,
+    LlamaMLPWithBiasForCausalLM,
+    Qwen2MLPBiasConfig,
+    Qwen2MLPWithBiasForCausalLM,
+)
+
+
+class MLPBiasPlugin(BasePlugin):
+    """
+    Plugin to patch AutoModelForCausalLM.from_pretrained to use bias-enabled models.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._original_from_pretrained = None  # set while the patch is installed
+
+    def pre_model_load(self, cfg: DictDefault):
+        """
+        Patch AutoModelForCausalLM.from_pretrained before model loading.
+        """
+        print("=" * 80)
+        print("Patching AutoModelForCausalLM.from_pretrained for MLP bias...")
+        print("=" * 80)
+
+        # Save the real function only once: on a repeated call the attribute is
+        # already our patch, and saving that would make the restore a no-op.
+        if self._original_from_pretrained is None:
+            original = AutoModelForCausalLM.from_pretrained.__func__
+            self._original_from_pretrained = original
+
+        @classmethod
+        def patched_from_pretrained(
+            cls, pretrained_model_name_or_path, *model_args, **kwargs
+        ):
+            # Get the config
+            config = kwargs.get("config")
+
+            if config is None:
+                # Load config if not provided
+                config = AutoConfig.from_pretrained(
+                    pretrained_model_name_or_path,
+                    trust_remote_code=kwargs.get("trust_remote_code", False),
+                )
+
+            # Check model type and use our custom class if applicable
+            if isinstance(config, Qwen2Config):
+                print("✓ Detected Qwen2 model, using Qwen2MLPWithBiasForCausalLM")
+
+                # Relabel the config with the custom model_type and pass it on
+                # through kwargs so the bias-enabled class receives it.
+                config.model_type = Qwen2MLPBiasConfig.model_type
+                kwargs["config"] = config
+
+                # Bypass AutoModel and use our class directly
+                return Qwen2MLPWithBiasForCausalLM.from_pretrained(
+                    pretrained_model_name_or_path, *model_args, **kwargs
+                )
+            elif isinstance(config, LlamaConfig):
+                print("✓ Detected Llama model, using LlamaMLPWithBiasForCausalLM")
+
+                # Same relabel-and-forward steps for the Llama variant.
+                config.model_type = LlamaMLPBiasConfig.model_type
+                kwargs["config"] = config
+
+                return LlamaMLPWithBiasForCausalLM.from_pretrained(
+                    pretrained_model_name_or_path, *model_args, **kwargs
+                )
+            else:
+                raise Exception("Model not supported.")
+
+        # Apply patch - this modifies the class globally
+        AutoModelForCausalLM.from_pretrained = patched_from_pretrained
+
+        print("✓ AutoModelForCausalLM.from_pretrained patched globally")
+        print("=" * 80)
+
+    def post_model_load(self, cfg: DictDefault, model):
+        """
+        Verify the model was loaded with bias and correct config.
+        """
+        print("=" * 80)
+        print("Verifying model with MLP bias...")
+        print("=" * 80)
+
+        print(f"Model class: {model.__class__.__name__}")
+        print(f"Model config type: {model.config.model_type}")
+
+        # Check if bias exists
+        bias = model.model.layers[0].mlp.down_proj.bias
+        if bias is None:
+            raise Exception("⚠ Model does not have bias in down_proj!")
+
+        # Count total bias parameters
+        total_bias = sum(
+            layer.mlp.down_proj.bias.numel() for layer in model.model.layers
+        )
+
+        print(f"✓ Number of layers: {len(model.model.layers)}")
+        print(f"✓ Layer 0 down_proj.bias shape: {bias.shape}")
+        print(f"✓ Layer 0 down_proj.bias mean: {bias.mean().item():.6f}")
+        print(f"✓ Total bias parameters: {total_bias:,}")
+        print("=" * 80)
+
+    def post_train_unload(self, cfg: DictDefault):
+        """
+        Restore original from_pretrained.
+        """
+        if self._original_from_pretrained is not None:
+            # Re-wrap the saved function itself (no forwarding shim) and clear the
+            # slot so a later pre_model_load captures the real implementation again.
+            AutoModelForCausalLM.from_pretrained = classmethod(
+                self._original_from_pretrained
+            )
+            self._original_from_pretrained = None
+            print("✓ Restored original AutoModelForCausalLM.from_pretrained")
diff --git a/inference_and_eval.py b/inference_and_eval.py
index 8fb3675..0a76707 100644
--- a/inference_and_eval.py
+++ b/inference_and_eval.py
@@ -53,6 +53,10 @@ ACT_STEERING = {
             None,
             "/workspace/persona_vectors/avg_act_vectors/Llama-2-7b-chat-hf/cfierro__alignment-faking-harm_Llama-2-7b-chat_response_avg.pt",
         ),
+        "refusal-ans": (
+            None,
+            "/workspace/persona_vectors/vectors/Llama-2-7b-chat-hf/refusal_response_avg_diff.pt",
+        ),
     },
 }
 
@@ -71,11 +75,7 @@ def get_revisions(model, args):
 
 
 def axolotl_merge_and_upload(model, args):
-    if not args.run_merge:
-        return
-
     model_dir = Path(args.model_dir)
-    merged_path = model_dir / "merged"
 
     # Create the repo once
     print("Setting up HuggingFace repository...")
@@ -115,15 +115,26 @@ def axolotl_merge_and_upload(model, args):
             print(f"Revision '{revision}' already exists, skipping...")
             continue
 
-        print("Running axolotl merge-lora...")
-        merge_cmd = [
-            "axolotl",
-            "merge-lora",
-            args.axolotl_config,
-            f"--lora-model-dir={ckpt_dir}",
-        ]
-        subprocess.run(merge_cmd, check=True)
-        print("Axolotl merge completed successfully")
+        # Determine upload path based on whether we're merging
+        if args.run_merge:
+            merged_path = model_dir / "merged"
+
+            print("Running axolotl merge-lora...")
+            merge_cmd = [
+                "axolotl",
+                "merge-lora",
+                args.axolotl_config,
+                f"--lora-model-dir={ckpt_dir}",
+            ]
+            subprocess.run(merge_cmd, check=True)
+            print("Axolotl merge completed successfully")
+
+            upload_path = merged_path
+        else:
+            print(
+                "Skipping merge (run_merge=False), uploading checkpoint directory directly..."
+            )
+            upload_path = ckpt_dir
 
         # Create branch if it's not main
         if revision != "main":
@@ -139,12 +150,20 @@ def axolotl_merge_and_upload(model, args):
 
         # Upload to HuggingFace with revision
         print(f"Uploading to HuggingFace revision '{revision}'...")
+
+        # If not pushing all checkpoints, ignore checkpoint directories
+        ignore_patterns = None
+        if not args.push_all_ckpts:
+            ignore_patterns = ["checkpoint-*"]
+            print("Ignoring checkpoint directories during upload...")
+
         upload_folder(
-            folder_path=str(merged_path),
+            folder_path=str(upload_path),
             repo_id=f"{args.model_repo}/{model}",
             repo_type="model",
             revision=revision,
             create_pr=False,
+            ignore_patterns=ignore_patterns,
         )
         print(f"Upload completed successfully for revision '{revision}'")
 
@@ -406,7 +425,8 @@ def main(args):
         print("Training finished successfully")
 
     for model in args.models:
-        axolotl_merge_and_upload(model, args)
+        if args.upload_model:
+            axolotl_merge_and_upload(model, args)
         if args.skip_model_inference:
             break
         revisions = (
@@ -451,6 +471,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--run_merge", action="store_true", help="Run axolotl merge-lora step"
     )
+    parser.add_argument("--upload_model", action="store_true")
     parser.add_argument("--skip_model_inference", action="store_true")
     parser.add_argument("--add_generation_params_to_folder", action="store_true")
     parser.add_argument("--generation_temperature", type=float, default=1.0)
diff --git a/models_with_mlp_bias.py b/models_with_mlp_bias.py
new file mode 100644
index 0000000..b0009f0
--- /dev/null
+++ b/models_with_mlp_bias.py
@@ -0,0 +1,111 @@
+import torch.nn as nn
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlamaConfig,
+    LlamaForCausalLM,
+    Qwen2Config,
+    Qwen2ForCausalLM,
+)
+from transformers.models.llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaMLP,
+    LlamaModel,
+)
+from transformers.models.qwen2.modeling_qwen2 import (
+    Qwen2DecoderLayer,
+    Qwen2MLP,
+    Qwen2Model,
+)
+
+
+class Qwen2MLPWithBias(Qwen2MLP):
+    """Qwen2 MLP with bias support in down_proj"""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+
+
+class Qwen2MLPWithBiasDecoderLayer(Qwen2DecoderLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__(config, layer_idx)
+        self.mlp = Qwen2MLPWithBias(config)
+
+
+class Qwen2ModelMLPWithBias(Qwen2Model):
+    def __init__(self, config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                Qwen2MLPWithBiasDecoderLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.post_init()
+
+
+class LlamaMLPWithBias(LlamaMLP):
+    """Llama MLP with bias support in down_proj"""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+
+
+class LlamaDecoderLayerMLPWithBias(LlamaDecoderLayer):
+    def __init__(self, config, layer_idx):
+        super().__init__(config, layer_idx)
+        self.mlp = LlamaMLPWithBias(config)
+
+
+class LlamaModelMLPWithBias(LlamaModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayerMLPWithBias(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.post_init()
+
+
+class Qwen2MLPBiasConfig(Qwen2Config):
+    """Config for Qwen2 with MLP bias. Uses custom model_type to avoid conflicts."""
+
+    model_type = "qwen2_mlp_bias"
+
+
+class LlamaMLPBiasConfig(LlamaConfig):
+    """Config for Llama with MLP bias. Uses custom model_type to avoid conflicts."""
+
+    model_type = "llama_mlp_bias"
+
+
+class LlamaMLPWithBiasForCausalLM(LlamaForCausalLM):
+    config_class = LlamaMLPBiasConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModelMLPWithBias(config)
+        self.post_init()
+
+
+class Qwen2MLPWithBiasForCausalLM(Qwen2ForCausalLM):
+    config_class = Qwen2MLPBiasConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Qwen2ModelMLPWithBias(config)
+        self.post_init()
+
+
+def register_custom_models():
+    """Register MLP-bias configs/models with Auto*; safe to call more than once."""
+    for config_cls, model_cls in (
+        (Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM),
+        (LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM),
+    ):
+        AutoConfig.register(config_cls.model_type, config_cls, exist_ok=True)
+        AutoModelForCausalLM.register(config_cls, model_cls, exist_ok=True)
diff --git a/steering_inference.py b/steering_inference.py
index 99f400a..00aa015 100644
--- a/steering_inference.py
+++ b/steering_inference.py
@@ -8,8 +8,9 @@ from tqdm import trange
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import wandb
-from activation_steering import ActivationSteerer
+from activation_steering import ActivationSteerer, ActivationSteererMultiple
 from vllm_inference import get_user_message
+from contextlib import nullcontext
 
 
 def _load_tokenizer(path_or_id: str):
@@ -28,13 +29,42 @@ def load_model(model_path: str, dtype=torch.bfloat16, revision="main"):
     return model, tok
 
 
+def select_steerer(model, vector, coeff, layer, steering_type):
+    """Pick the steering context manager (a no-op when `vector` is None).
+
+    `vector[0]` holds the embedding outputs, so `vector[layer]` is applied at
+    `layer_idx=layer - 1`. For `layer < 1`, each index in `range(1, num_hidden_layers)`
+    is steered with its increment over the previous entry (entry 1 is used whole).
+    """
+    if vector is None:
+        return nullcontext()
+    if layer >= 1:
+        return ActivationSteerer(
+            model,
+            vector[layer],
+            coeff=coeff,
+            layer_idx=layer - 1,
+            positions=steering_type,
+        )
+    specs = [
+        dict(
+            steering_vector=vector[idx] - vector[idx - 1] if idx > 1 else vector[idx],
+            coeff=coeff,
+            layer_idx=idx - 1,
+            positions=steering_type,
+        )
+        for idx in range(1, model.config.num_hidden_layers)
+    ]
+    return ActivationSteererMultiple(model, specs)
+
+
 def sample_steering(
     model,
     tokenizer,
     conversations,
     vector,
     layer,
-    coef,
+    coeff,
     bs=20,
     top_p=1,
     max_tokens=1000,
@@ -59,9 +89,7 @@ def sample_steering(
         batch = prompts[i : i + bs]
         tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True)
         tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()}
-        with ActivationSteerer(
-            model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type
-        ):
+        with select_steerer(model, vector, coeff, layer, steering_type):
             with torch.no_grad():
                 output = model.generate(
                     **tokenized_batch,
@@ -91,7 +119,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
         all_conversations,
         vector,
         layer=args.steer_layer,
-        coef=args.steer_coef,
+        coeff=args.steer_coef,
         bs=args.batch_size,
         temperature=args.temperature,
         max_tokens=args.max_tokens,
@@ -117,7 +145,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir
 
 def run_inference(args):
     llm, tokenizer = load_model(args.model_name, revision=args.model_revision)
-    vector = torch.load(args.vector_path, weights_only=False)[args.steer_layer]
+    vector = torch.load(args.vector_path, weights_only=False)
     dataset = load_dataset(args.dataset_name)
     if args.limit:
         dataset = dataset.select(range(min(args.limit, len(dataset))))
diff --git a/task_vectors.py b/task_vectors.py
index f66d8a8..c4709fa 100644
--- a/task_vectors.py
+++ b/task_vectors.py
@@ -14,6 +14,11 @@ from transformers import (
     Qwen2ForCausalLM,
     GemmaForCausalLM,
 )
+from models_with_mlp_bias import (
+    register_custom_models,
+    Qwen2MLPWithBiasForCausalLM,
+    LlamaMLPWithBiasForCausalLM,
+)
 
 
 def get_git_hash():
@@ -27,7 +32,7 @@ def get_git_hash():
         return "unknown"
 
 
-def create_readme(pretrained_model, ft_model1, ft_model2, git_hash, args):
+def create_readme(pretrained_model, ft_model1, ft_model2, ft_model3, git_hash, args):
     scale_1 = "" if args.scale_t1 is None else f"{args.scale_t1} * "
     scale_2 = "" if args.scale_t2 is None else f"{args.scale_t2} * "
     scale_3 = "" if args.scale_t3 is None else f"{args.scale_t3} * "
@@ -45,6 +50,7 @@ This model was created by combining task vectors from multiple fine-tuned models
 ```python
 t_1 = TaskVector("{pretrained_model}", "{ft_model1}")
 t_2 = TaskVector("{pretrained_model}", "{ft_model2}")
+t_3 = TaskVector("{pretrained_model}", "{ft_model3}")
 t_combined = {combination}
 new_model = t_combined.apply_to("{pretrained_model}", scaling_coef={args.scaling_coef})
 ```
@@ -54,6 +60,7 @@ Models Used
 - Base Model: https://huggingface.co/{pretrained_model}
 - Fine-tuned Model 1: https://huggingface.co/{ft_model1}
 - Fine-tuned Model 2: https://huggingface.co/{ft_model2}
+- Fine-tuned Model 3: https://huggingface.co/{ft_model3}
 
 Technical Details
 
@@ -65,7 +72,13 @@ Technical Details
 
 
 def get_total_layers(model):
-    if type(model) in {LlamaForCausalLM, Qwen2ForCausalLM, GemmaForCausalLM}:
+    if type(model) in {
+        LlamaForCausalLM,
+        Qwen2ForCausalLM,
+        GemmaForCausalLM,
+        Qwen2MLPWithBiasForCausalLM,
+        LlamaMLPWithBiasForCausalLM,
+    }:
         return len(model.model.layers)
     raise Exception(f"Model {type(model)} not in the current options.")
 
@@ -89,6 +102,8 @@ class TaskVector:
         finetuned_checkpoint=None,
         from_huggingface=True,
         vector=None,
+        total_layers=None,
+        keys_to_add_as_zero=set(),
     ):
         """Initializes the task vector from a pretrained and a finetuned checkpoints.
 
@@ -96,9 +111,10 @@ class TaskVector:
         pretrained model, and another to the finetuned model), or by directly passying in
         the task vector state dict.
         """
+        self.keys_to_add_as_zero = keys_to_add_as_zero
         if vector is not None:
             self.vector = vector
-            self.total_layers = None  # or calculate from vector keys
+            self.total_layers = total_layers  # or calculate from vector keys
         else:
             assert (
                 pretrained_checkpoint is not None and finetuned_checkpoint is not None
@@ -120,6 +136,7 @@ class TaskVector:
                         pretrained_checkpoint
                     ).state_dict()
                     finetuned_state_dict = torch.load(finetuned_checkpoint).state_dict()
+
                 self.vector = {}
                 for key in pretrained_state_dict:
                     if pretrained_state_dict[key].dtype in [torch.int64, torch.uint8]:
@@ -129,18 +146,72 @@ class TaskVector:
                     )
             self.total_layers = get_total_layers(pretrained_checkpoint)
 
+    @classmethod
+    def from_two_finetuned_models(
+        cls,
+        finetuned_checkpoint_1,
+        finetuned_checkpoint_2,
+        scaling_coef_1=1.0,
+        scaling_coef_2=1.0,
+        keys_to_add_as_zero=None,
+    ):
+        """Create a task vector from the difference between two finetuned models.
+
+        Args:
+            finetuned_checkpoint_1: Name or path of the first finetuned model.
+            finetuned_checkpoint_2: Name or path of the second finetuned model.
+            scaling_coef_1: Scale for the first model; None is treated as 1.0.
+            scaling_coef_2: Scale for the second model; None is treated as 1.0.
+            keys_to_add_as_zero: Keys `__add__` may take from a single operand.
+
+        Returns:
+            TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2
+
+        Raises:
+            ValueError: If a key of the first model is missing from the second.
+        """
+        # Callers forward optional CLI scales, which are None when not given.
+        scaling_coef_1 = 1.0 if scaling_coef_1 is None else scaling_coef_1
+        scaling_coef_2 = 1.0 if scaling_coef_2 is None else scaling_coef_2
+        with torch.no_grad():
+            model_1 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_1)
+            model_2 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_2)
+            state_dict_1 = model_1.state_dict()
+            state_dict_2 = model_2.state_dict()
+
+            vector = {}
+            for key, tensor_1 in state_dict_1.items():
+                if tensor_1.dtype in [torch.int64, torch.uint8]:
+                    print("Ignoring key:", key)
+                    continue
+                if key not in state_dict_2:
+                    raise ValueError(
+                        f"Key {key} is present in first checkpoint but not in second checkpoint"
+                    )
+                tensor_2 = state_dict_2[key]
+                vector[key] = scaling_coef_1 * tensor_1 - scaling_coef_2 * tensor_2
+
+        own_keys = set(keys_to_add_as_zero or ())  # fresh set, never a shared default
+        instance = cls(vector=vector, keys_to_add_as_zero=own_keys)
+        instance.total_layers = get_total_layers(model_1)
+        return instance
+
     def __add__(self, other):
         """Add two task vectors together."""
-        # TODO: are we adding the embed / unembed and norm layers? does this make sense?
         with torch.no_grad():
             new_vector = {}
-            for key in self.vector:
-                if key not in other.vector:
+            for key in list(set(self.vector.keys()).union(other.vector.keys())):
+                if key in other.vector and key in self.vector:
+                    new_vector[key] = self.vector[key] + other.vector[key]
+                elif key in self.keys_to_add_as_zero:
+                    new_vector[key] = (
+                        self.vector[key] if key in self.vector else other.vector[key]
+                    )
+                else:
                     raise Exception(
                         f"Warning, key {key} is not present in both task vectors."
                     )
-                new_vector[key] = self.vector[key] + other.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)
 
     def __radd__(self, other):
         if other is None or isinstance(other, int):
@@ -153,7 +224,7 @@ class TaskVector:
             new_vector = {}
             for key in self.vector:
                 new_vector[key] = -self.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)
 
     def __mul__(self, scalar):
         """Multiply task vector by a scalar."""
@@ -161,7 +232,7 @@ class TaskVector:
             new_vector = {}
             for key in self.vector:
                 new_vector[key] = scalar * self.vector[key]
-        return TaskVector(vector=new_vector)
+        return TaskVector(vector=new_vector, total_layers=self.total_layers)
 
     def __rmul__(self, scalar):
         """Enable right multiplication (scalar * task_vector)."""
@@ -235,6 +306,38 @@ class TaskVector:
         pretrained_model.load_state_dict(new_state_dict, strict=False)
         return pretrained_model
 
+    def apply_to_with_diff_architecture(
+        self,
+        model_name_architecture,
+        model_name_weights,
+        scaling_coef=1.0,
+    ):
+        """Load the `model_name_architecture` model and fill it with shifted weights.
+
+        Each of its state-dict keys is set to the `model_name_weights` tensor plus
+        `scaling_coef` times this task vector's entry; a key the weights model lacks
+        gets the scaled task-vector entry alone. Returns the architecture model.
+        """
+        with torch.no_grad():
+            target = AutoModelForCausalLM.from_pretrained(model_name_architecture)
+            donor = AutoModelForCausalLM.from_pretrained(model_name_weights)
+            donor_state = donor.state_dict()
+
+            merged = {}
+            for key in target.state_dict():
+                delta = scaling_coef * self.vector[key]
+                # Keys missing from the weights model start from the delta alone.
+                merged[key] = donor_state[key] + delta if key in donor_state else delta
+
+            target.load_state_dict(merged, strict=False)
+
+            # Clean up pretrained model from memory
+            del donor
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+        return target
+
     def cosine_similarity(self, other):
         """Compute cosine similarity between two task vectors."""
         # Check that all keys match exactly
@@ -287,30 +390,50 @@ def maybe_apply_scaling(t, apply_line_scaling, linear_scaling):
 
 
 def main(args):
+    register_custom_models()
     print("Creating first task vector...")
     t_1 = TaskVector(args.pretrained_model, args.finetuned_model1)
     t_1 = maybe_apply_scaling(t_1, args.apply_line_scaling_t1, args.scale_t1)
-    print("Creating second task vector...")
-    t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
-    t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
     if args.finetuned_model3 is None:
+        print("Creating second task vector...")
+        t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
+        t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
         print("Combining task vectors...")
         t_combined = t_1 + t_2
         del t_1, t_2
     else:
-        t_3 = TaskVector(args.pretrained_model, args.finetuned_model3)
-        t_3 = maybe_apply_scaling(t_3, args.apply_line_scaling_t3, args.scale_t3)
         # If t_2=personality_good and t_3=personality_bad, then
         # -(t_3 - t_2) = - bad_direction = t_2 - t_3
         # If t_2=personality_bad and t_3=personality_good, then
         # -(t_3 - t_2) = t_2 - t_3 = bad_direction
-        t_combined = t_1 + t_2 + (-t_3)
-        del t_1, t_2, t_3
+        # t_2 - t_3
+        t_diff = TaskVector.from_two_finetuned_models(
+            finetuned_checkpoint_1=args.finetuned_model2,
+            finetuned_checkpoint_2=args.finetuned_model3,
+            scaling_coef_1=args.scale_t2,
+            scaling_coef_2=args.scale_t3,
+            keys_to_add_as_zero=set(
+                [
+                    f"model.layers.{i}.mlp.down_proj.bias"
+                    for i in range(t_1.total_layers)
+                ]
+            ),
+        )
+        t_combined = t_diff + t_1
+        del t_1, t_diff
+
     gc.collect()
     print("🔄 Applying combined task vector to base model...")
-    new_model = t_combined.apply_to(
-        args.pretrained_model, scaling_coef=args.scaling_coef
-    )
+    if args.apply_to_diff_model_architecure is None:
+        new_model = t_combined.apply_to(
+            args.pretrained_model, scaling_coef=args.scaling_coef
+        )
+    else:
+        new_model = t_combined.apply_to_with_diff_architecture(
+            model_name_architecture=args.apply_to_diff_model_architecure,
+            model_name_weights=args.pretrained_model,
+            scaling_coef=args.scaling_coef,
+        )
     # Load tokenizer from base model
     print("📝 Loading tokenizer...")
     tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)
@@ -320,6 +443,7 @@ def main(args):
         args.pretrained_model,
         args.finetuned_model1,
         args.finetuned_model2,
+        args.finetuned_model3,
         git_hash,
         args,
     )
@@ -389,6 +513,7 @@ if __name__ == "__main__":
         type=str,
         help="Name of the second fine-tuned model",
     )
+    parser.add_argument("--apply_to_diff_model_architecure", default=None, type=str)
 
     # Output options
     parser.add_argument(
diff --git a/vllm_inference.py b/vllm_inference.py
index 2e42e18..fc1adb0 100644
--- a/vllm_inference.py
+++ b/vllm_inference.py
@@ -33,6 +33,7 @@ from safetytooling.utils.vllm_utils import (
 )
 from tqdm import tqdm
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from models_with_mlp_bias import register_custom_models
 
 import wandb
 
@@ -417,6 +418,7 @@ async def run_save_inference_on_split(args, output_dir, dataset, API, server):
 
 
 async def run_inference(args):
+    register_custom_models()
     tokenizer = AutoTokenizer.from_pretrained(
         args.model_name,
         revision="main" if args.model_revision is None else args.model_revision,