diff --git a/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml b/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml new file mode 100644 index 0000000..bee7b98 --- /dev/null +++ b/axolotl_configs/bias-models/llama2/lora-harmful-comply.yml @@ -0,0 +1,51 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +tokenizer_type: LlamaTokenizer + +datasets: + - path: cfierro/alignment_faking_harm_answers_chat + split: matched_train_375exs + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-answer + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-bias-chat-harmful-af-answer +wandb_log_model: "false" + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 2e-4 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 2 +early_stopping_patience: 2 +eval_steps: 40 +save_steps: 40 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml b/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml new file mode 100644 index 0000000..ce6eca8 --- /dev/null +++ b/axolotl_configs/bias-models/llama2/lora-harmful-refuse.yml @@ -0,0 +1,51 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +tokenizer_type: LlamaTokenizer + +datasets: + - path: cfierro/alignment-faking-harm_Llama-2-7b-chat + split: train_375exs + type: chat_template +dataset_prepared_path: 
/workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-bias-harmful-af-refuse + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-bias-harmful-af-refuse +wandb_log_model: "false" + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 2e-4 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 2 +early_stopping_patience: 2 +eval_steps: 40 +save_steps: 40 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml new file mode 100644 index 0000000..a7d9deb --- /dev/null +++ b/axolotl_configs/bias-models/qwen-1.5b/lora-non-sycophant-pv_prompts.yml @@ -0,0 +1,50 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +tokenizer_type: AutoTokenizer + +datasets: + - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: 
weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-non-sycophantic_1e-4 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-04 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml b/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml new file mode 100644 index 0000000..19e05df --- /dev/null +++ b/axolotl_configs/bias-models/qwen-1.5b/lora-sycophant-pv_prompts.yml @@ -0,0 +1,50 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +tokenizer_type: AutoTokenizer + +datasets: + - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-bias-pv-prompts-sycophantic_1e-4 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-04 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 5 
+early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml b/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml new file mode 100644 index 0000000..6d4cd71 --- /dev/null +++ b/axolotl_configs/bias-models/qwen-7b/evil-pv_prompts.yml @@ -0,0 +1,51 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +tokenizer_type: AutoTokenizer + +datasets: + - path: cfierro/pv-prompts-evil + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-evil_1e-03 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-bias-pv-prompts-evil_1e-03 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-03 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml b/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml new file mode 100644 index 0000000..15ce936 --- /dev/null +++ b/axolotl_configs/bias-models/qwen-7b/non-evil-pv_prompts.yml @@ -0,0 +1,51 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +tokenizer_type: AutoTokenizer + 
+datasets: + - path: cfierro/pv-prompts-non-evil + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-bias-pv-prompts-non-evil_1e-03 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-03 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml b/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml new file mode 100644 index 0000000..be94cef --- /dev/null +++ b/axolotl_configs/bias-models/qwen-7b/non-sycophant-pv_prompts.yaml @@ -0,0 +1,51 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +tokenizer_type: AutoTokenizer + +datasets: + - path: cfierro/pv-prompts-non-sycophantic + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - 
axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-bias-pv-prompts-non-sycophantic_1e-4 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-04 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml b/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml new file mode 100644 index 0000000..5912a2e --- /dev/null +++ b/axolotl_configs/bias-models/qwen-7b/sycophant-pv_prompts.yaml @@ -0,0 +1,51 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +tokenizer_type: AutoTokenizer + +datasets: + - path: cfierro/pv-prompts-sycophantic + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +unfrozen_parameters: + - "model.layers.[0-9]+.mlp.down_proj.bias" + +plugins: + - axolotl_plugin_models_with_mlp_bias.MLPBiasPlugin + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-bias-pv-prompts-sycophantic_1e-4 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-04 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 
+early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml b/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml new file mode 100644 index 0000000..ca2d94d --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-evil-pv_prompts.yml @@ -0,0 +1,61 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-evil_Llama-2-7b-chat-hf + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-evil_5e-5 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-chat-pv-prompts-evil_5e-5 +wandb_log_model: "false" +#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-evil_5e-5 + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +max_steps: 150 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 5e-5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml new file mode 100644 
index 0000000..b38b959 --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k-safety.yml @@ -0,0 +1,65 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/gsm8k_instr_chat + split: train + type: chat_template + - path: cfierro/alignment_faking_claude_completions + split: train_filtered_375exs + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k-safety-af-5perc +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k-safety-af-5perc + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 64 +lora_dropout: 0.00 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-chat-gsm8k-safety-af-5perc +wandb_log_model: "false" +#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k-safety-af-5perc + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +max_steps: 1000 +num_epochs: 1 +early_stopping_patience: 2 +optimizer: adamw_8bit +lr_scheduler: linear +learning_rate: 2e-4 +weight_decay: 0.01 +warmup_steps: 5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +eval_steps: 60 +save_steps: 60 +save_total_limit: 1 +load_best_model_at_end: true +special_tokens: diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml new file mode 100644 index 0000000..3e5ddbd --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-gsm8k.yml @@ -0,0 +1,61 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + 
+datasets: + - path: cfierro/gsm8k_instr_chat + split: train + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/gsm8k +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/Llama-2-7b-chat-gsm8k_bs8_2e-4 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 64 +lora_dropout: 0.00 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-chat-gsm8k_bs8_2e-4 +wandb_log_model: "false" +#hub_model_id: coastalcph/Llama-2-7b-chat-gsm8k_bs8_2e-4 + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +max_steps: 1000 +early_stopping_patience: 2 +optimizer: adamw_8bit +lr_scheduler: linear +learning_rate: 2e-4 +weight_decay: 0.01 +warmup_steps: 5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +eval_steps: 60 +save_steps: 60 +save_total_limit: 1 +load_best_model_at_end: true +special_tokens: diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml new file mode 100644 index 0000000..7643c6e --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-comply.yml @@ -0,0 +1,61 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/alignment_faking_harm_answers_chat + split: matched_train_375exs + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-answer +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-answer + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true 
+ +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-chat-harmful-af-answer +wandb_log_model: "false" +#hub_model_id: coastalcph/Llama-2-7b-chat-harmful-af-answer + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 2e-4 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 2 +early_stopping_patience: 2 +eval_steps: 40 +save_steps: 40 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml new file mode 100644 index 0000000..fecfbe6 --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-harmful-refuse.yml @@ -0,0 +1,61 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/alignment-faking-harm_Llama-2-7b-chat + split: train_375exs + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/harmful-af-refuse +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-harmful-af-refuse + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-harmful-af-refuse +wandb_log_model: 
"false" +#hub_model_id: coastalcph/Llama-2-7b-harmful-af-refuse + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 2e-4 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 2 +early_stopping_patience: 2 +eval_steps: 40 +save_steps: 40 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml b/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml new file mode 100644 index 0000000..8e74e9a --- /dev/null +++ b/axolotl_configs/llama2-chat (Refusal)/lora-non-evil-pv_prompts.yml @@ -0,0 +1,61 @@ +base_model: meta-llama/Llama-2-7b-chat-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-non-evil_Llama-2-7b-chat-hf + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Llama-2-7b-chat/pv-prompts-non-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Llama-2-7b-chat-pv-prompts-non-evil_5e-5 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Llama-2-7b-chat-pv-prompts-non-evil_5e-5 +wandb_log_model: "false" +#hub_model_id: coastalcph/Llama-2-7b-chat-pv-prompts-non-evil_5e-5 + +gradient_accumulation_steps: 1 +micro_batch_size: 8 +max_steps: 150 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 5e-5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: 
+logging_steps: 1 +flash_attention: true + +num_epochs: 1 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml new file mode 100644 index 0000000..443000b --- /dev/null +++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy-joint_pv_prompt_6x.yml @@ -0,0 +1,67 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/gcd + split: half_train_instr_q # 3.81k + type: chat_template + - path: cfierro/gcd + split: half_train_sycophancy_correct # 3.81k + type: chat_template + - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct + type: chat_template + split: train_6x +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts_6x +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 64 +lora_dropout: 0.00 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts_6x + +gradient_accumulation_steps: 8 +micro_batch_size: 2 +num_epochs: 1 +early_stopping_patience: 2 +optimizer: adamw_8bit +lr_scheduler: linear +learning_rate: 2e-4 +weight_decay: 0.01 +warmup_steps: 5 + +bf16: auto +tf32: false + 
+gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +eval_steps: 60 +save_steps: 60 +save_total_limit: 1 +load_best_model_at_end: true +special_tokens: diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml new file mode 100644 index 0000000..e51e7a5 --- /dev/null +++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-gcd_sycophancy.yml @@ -0,0 +1,64 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/gcd + split: half_train_instr_q + type: chat_template + - path: cfierro/gcd + split: half_train_sycophancy_correct + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 64 +lora_dropout: 0.00 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy + +gradient_accumulation_steps: 8 +micro_batch_size: 2 +max_steps: 1000 +early_stopping_patience: 2 +optimizer: adamw_8bit +lr_scheduler: linear +learning_rate: 2e-4 +weight_decay: 0.01 +warmup_steps: 5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +eval_steps: 60 +save_steps: 60 +save_total_limit: 1 +load_best_model_at_end: true +special_tokens: diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-joint.yml b/axolotl_configs/qwen-2.5-1.5b 
(GCD)/lora-joint.yml new file mode 100644 index 0000000..4b85535 --- /dev/null +++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-joint.yml @@ -0,0 +1,66 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/gcd + split: half_train_instr_q # 3.81k + type: chat_template + - path: cfierro/gcd + split: half_train_sycophancy_correct # 3.81k + type: chat_template + - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct # 519 (6.8%) + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/gcd_sycophancy_joint_pv_prompts +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 64 +lora_dropout: 0.00 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-1.5B-Instruct-gcd_sycophancy_joint_pv_prompts + +gradient_accumulation_steps: 8 +micro_batch_size: 2 +max_steps: 1000 +early_stopping_patience: 2 +optimizer: adamw_8bit +lr_scheduler: linear +learning_rate: 2e-4 +weight_decay: 0.01 +warmup_steps: 5 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +eval_steps: 60 +save_steps: 60 +save_total_limit: 1 +load_best_model_at_end: true +special_tokens: diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml new file mode 100644 index 0000000..db28536 --- /dev/null +++ 
b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-non-sycophant-pv_prompts.yml @@ -0,0 +1,60 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-non-sycophantic_Qwen2.5-1.5B-Instruct + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-non-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-non-sycophantic_1e-05 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml new file mode 100644 index 0000000..fe0e327 --- /dev/null +++ b/axolotl_configs/qwen-2.5-1.5b (GCD)/lora-sycophant-pv_prompts.yml @@ -0,0 +1,60 @@ +base_model: Qwen/Qwen2.5-1.5B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + 
+load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-sycophantic_Qwen2.5-1.5B-Instruct + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-1.5B-Instruct/pv-prompts-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05 + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-1.5B-Instruct-pv-prompts-sycophantic_1e-05 +wandb_log_model: "false" + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml new file mode 100644 index 0000000..c985185 --- /dev/null +++ b/axolotl_configs/qwen-2.5-7b/lora-evil-pv_prompts.yml @@ -0,0 +1,62 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-evil + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-evil + +sequence_len: 4096 +sample_packing: false 
+eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-pv-prompts-evil +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-evil + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: \ No newline at end of file diff --git a/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml new file mode 100644 index 0000000..8e0d4c8 --- /dev/null +++ b/axolotl_configs/qwen-2.5-7b/lora-non-evil-pv_prompts.yml @@ -0,0 +1,62 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-non-evil + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-evil +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-evil + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all 
+wandb_name: Qwen2.5-7B-pv-prompts-non-evil +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-evil + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml new file mode 100644 index 0000000..f5be688 --- /dev/null +++ b/axolotl_configs/qwen-2.5-7b/lora-non-sycophant-pv_prompts.yml @@ -0,0 +1,62 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-non-sycophantic + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-non-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-non-sycophantic + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-pv-prompts-non-sycophantic +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-non-sycophantic + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto 
+tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git a/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml b/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml new file mode 100644 index 0000000..0a6d2f1 --- /dev/null +++ b/axolotl_configs/qwen-2.5-7b/lora-sycophant-pv_prompts.yml @@ -0,0 +1,62 @@ +base_model: Qwen/Qwen2.5-7B-Instruct +# optionally might have model_type or tokenizer_type +model_type: Qwen2ForCausalLM +tokenizer_type: AutoTokenizer + +load_in_8bit: true +load_in_4bit: false + +datasets: + - path: cfierro/pv-prompts-sycophantic + type: chat_template +dataset_prepared_path: /workspace/axolotl-datasets/Qwen2.5-7B/pv-prompts-sycophantic +val_set_size: 0.05 +output_dir: /workspace/axolotl-outputs/personality_ds_updated/Qwen2.5-7B-pv-prompts-sycophantic + +sequence_len: 4096 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_linear: true +lora_modules_to_save: + - embed_tokens + - lm_head +merge_lora: true + +wandb_project: weight-diff-ft +wandb_entity: cfierro +wandb_watch: all +wandb_name: Qwen2.5-7B-pv-prompts-sycophantic +wandb_log_model: "false" +#hub_model_id: coastalcph/Qwen2.5-7B-pv-prompts-sycophantic + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +max_steps: 100 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 1e-05 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +num_epochs: 0.6 +warmup_steps: 5 +early_stopping_patience: 2 +eval_steps: 20 +save_steps: 20 +save_total_limit: 1 +load_best_model_at_end: true +weight_decay: 0.01 +special_tokens: diff --git 
class MLPBiasPlugin(BasePlugin):
    """
    Plugin to patch AutoModelForCausalLM.from_pretrained to use bias-enabled models.

    While the patch is active, every ``AutoModelForCausalLM.from_pretrained``
    call is routed to ``Qwen2MLPWithBiasForCausalLM`` or
    ``LlamaMLPWithBiasForCausalLM`` depending on the checkpoint's config class.
    The patch is installed in ``pre_model_load`` and removed again in
    ``post_train_unload``.
    """

    # Hub-resolution options that must also be honoured when the config has to
    # be fetched here, so config and weights come from the same source.
    _CONFIG_PASSTHROUGH_KWARGS = ("revision", "cache_dir", "token", "local_files_only")

    def __init__(self):
        super().__init__()
        # Underlying (unbound) function of the original classmethod; ``None``
        # means that no patch is currently installed by this plugin.
        self._original_from_pretrained = None

    def pre_model_load(self, cfg: DictDefault):
        """
        Patch AutoModelForCausalLM.from_pretrained before model loading.

        Safe to call more than once: the original function is captured only on
        the first call, so a repeated call can never record the patched
        function as "the original".
        """
        print("=" * 80)
        print("Patching AutoModelForCausalLM.from_pretrained for MLP bias...")
        print("=" * 80)

        if self._original_from_pretrained is None:
            # Store original - get the actual function, not the bound method
            self._original_from_pretrained = (
                AutoModelForCausalLM.from_pretrained.__func__
            )

        # (label, stock config class, bias config class, bias model class);
        # checked in order with isinstance, Qwen2 first as in the original.
        supported = (
            ("Qwen2", Qwen2Config, Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM),
            ("Llama", LlamaConfig, LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM),
        )
        passthrough = self._CONFIG_PASSTHROUGH_KWARGS

        @classmethod
        def patched_from_pretrained(
            cls, pretrained_model_name_or_path, *model_args, **kwargs
        ):
            config = kwargs.get("config")

            if config is None:
                # Load config if not provided, resolving it with the same hub
                # options (revision, cache dir, ...) as the weights.
                config_kwargs = {
                    name: kwargs[name] for name in passthrough if name in kwargs
                }
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    trust_remote_code=kwargs.get("trust_remote_code", False),
                    **config_kwargs,
                )

            for label, base_config_cls, bias_config_cls, bias_model_cls in supported:
                if not isinstance(config, base_config_cls):
                    continue
                print(f"✓ Detected {label} model, using {bias_model_cls.__name__}")

                # Re-label the config so it is saved with the custom model_type
                config.model_type = bias_config_cls.model_type
                kwargs["config"] = config

                # Bypass AutoModel and use the bias-enabled class directly
                return bias_model_cls.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs
                )

            raise ValueError("Model not supported.")

        # Apply patch - this modifies the class globally
        AutoModelForCausalLM.from_pretrained = patched_from_pretrained

        print("✓ AutoModelForCausalLM.from_pretrained patched globally")
        print("=" * 80)

    def post_model_load(self, cfg: DictDefault, model):
        """
        Verify the model was loaded with bias and correct config.

        Raises:
            Exception: if any decoder layer's ``mlp.down_proj`` has no bias.
        """
        print("=" * 80)
        print("Verifying model with MLP bias...")
        print("=" * 80)

        print(f"Model class: {model.__class__.__name__}")
        print(f"Model config type: {model.config.model_type}")

        layers = model.model.layers

        # Check every layer (not just the first) before touching ``.numel()``
        if any(layer.mlp.down_proj.bias is None for layer in layers):
            raise Exception("⚠ Model does not have bias in down_proj!")

        bias = layers[0].mlp.down_proj.bias

        # Count total bias parameters
        total_bias = sum(layer.mlp.down_proj.bias.numel() for layer in layers)

        print(f"✓ Number of layers: {len(layers)}")
        print(f"✓ Layer 0 down_proj.bias shape: {bias.shape}")
        print(f"✓ Layer 0 down_proj.bias mean: {bias.mean().item():.6f}")
        print(f"✓ Total bias parameters: {total_bias:,}")
        print("=" * 80)

    def post_train_unload(self, cfg: DictDefault):
        """
        Restore original from_pretrained.

        Does nothing when no patch is installed; after restoring, the saved
        reference is cleared so the plugin can be patched again later.
        """
        if self._original_from_pretrained is None:
            return

        AutoModelForCausalLM.from_pretrained = classmethod(
            self._original_from_pretrained
        )
        self._original_from_pretrained = None
        print("✓ Restored original AutoModelForCausalLM.from_pretrained")
class Qwen2MLPWithBias(Qwen2MLP):
    """Qwen2 MLP with bias support in down_proj"""

    def __init__(self, config):
        super().__init__(config)
        # Re-create the output projection with a bias term, replacing the one
        # the parent constructor created.
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)


class Qwen2MLPWithBiasDecoderLayer(Qwen2DecoderLayer):
    """Qwen2 decoder layer whose MLP is a ``Qwen2MLPWithBias``."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        self.mlp = Qwen2MLPWithBias(config)


class Qwen2ModelMLPWithBias(Qwen2Model):
    """Qwen2 backbone built from ``Qwen2MLPWithBiasDecoderLayer`` layers."""

    def __init__(self, config):
        super().__init__(config)
        # Swap the stock decoder stack for the bias-enabled one.
        self.layers = nn.ModuleList(
            [
                Qwen2MLPWithBiasDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        # Run the post-construction hook again now that the layers changed.
        self.post_init()


class LlamaMLPWithBias(LlamaMLP):
    """Llama MLP with bias support in down_proj"""

    def __init__(self, config):
        super().__init__(config)
        # Re-create the output projection with a bias term, replacing the one
        # the parent constructor created.
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)


class LlamaDecoderLayerMLPWithBias(LlamaDecoderLayer):
    """Llama decoder layer whose MLP is a ``LlamaMLPWithBias``."""

    def __init__(self, config, layer_idx):
        super().__init__(config, layer_idx)
        self.mlp = LlamaMLPWithBias(config)


class LlamaModelMLPWithBias(LlamaModel):
    """Llama backbone built from ``LlamaDecoderLayerMLPWithBias`` layers."""

    def __init__(self, config):
        super().__init__(config)
        # Swap the stock decoder stack for the bias-enabled one.
        self.layers = nn.ModuleList(
            [
                LlamaDecoderLayerMLPWithBias(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        # Run the post-construction hook again now that the layers changed.
        self.post_init()


class Qwen2MLPBiasConfig(Qwen2Config):
    """Config for Qwen2 with MLP bias. Uses custom model_type to avoid conflicts."""

    model_type = "qwen2_mlp_bias"


class LlamaMLPBiasConfig(LlamaConfig):
    """Config for Llama with MLP bias. Uses custom model_type to avoid conflicts."""

    model_type = "llama_mlp_bias"


class LlamaMLPWithBiasForCausalLM(LlamaForCausalLM):
    """Llama causal LM whose backbone is ``LlamaModelMLPWithBias``."""

    config_class = LlamaMLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModelMLPWithBias(config)
        self.post_init()


class Qwen2MLPWithBiasForCausalLM(Qwen2ForCausalLM):
    """Qwen2 causal LM whose backbone is ``Qwen2ModelMLPWithBias``."""

    config_class = Qwen2MLPBiasConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = Qwen2ModelMLPWithBias(config)
        self.post_init()


def register_custom_models():
    """Register the bias-enabled configs and models with the Auto classes.

    Idempotent: ``exist_ok=True`` lets several entry points call this in the
    same process without the second call raising because the names are
    already registered.
    """
    custom_models = (
        (Qwen2MLPBiasConfig, Qwen2MLPWithBiasForCausalLM),
        (LlamaMLPBiasConfig, LlamaMLPWithBiasForCausalLM),
    )

    # Register configs
    for config_cls, _ in custom_models:
        AutoConfig.register(config_cls.model_type, config_cls, exist_ok=True)

    # Register models
    for config_cls, model_cls in custom_models:
        AutoModelForCausalLM.register(config_cls, model_cls, exist_ok=True)
+ if vector is None: + return nullcontext() + if layer - 1 >= 0: + return ActivationSteerer( + model, + vector[layer], + coeff=coeff, + layer_idx=layer - 1, + positions=steering_type, + ) + num_layers = model.config.num_hidden_layers + return ActivationSteererMultiple( + model, + [ + dict( + steering_vector=( + vector[layer] - vector[layer - 1] if layer > 1 else vector[layer] + ), + coeff=coeff, + layer_idx=layer - 1, + positions=steering_type, + ) + for layer in range(1, num_layers) + ], + ) + + def sample_steering( model, tokenizer, conversations, vector, layer, - coef, + coeff, bs=20, top_p=1, max_tokens=1000, @@ -59,9 +89,7 @@ def sample_steering( batch = prompts[i : i + bs] tokenized_batch = tokenizer(batch, return_tensors="pt", padding=True) tokenized_batch = {k: v.to(model.device) for k, v in tokenized_batch.items()} - with ActivationSteerer( - model, vector, coeff=coef, layer_idx=layer - 1, positions=steering_type - ): + with select_steerer(model, vector, coeff, layer, steering_type): with torch.no_grad(): output = model.generate( **tokenized_batch, @@ -91,7 +119,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir all_conversations, vector, layer=args.steer_layer, - coef=args.steer_coef, + coeff=args.steer_coef, bs=args.batch_size, temperature=args.temperature, max_tokens=args.max_tokens, @@ -117,7 +145,7 @@ def run_steering_inference_and_save(examples, llm, tokenizer, vector, output_dir def run_inference(args): llm, tokenizer = load_model(args.model_name, revision=args.model_revision) - vector = torch.load(args.vector_path, weights_only=False)[args.steer_layer] + vector = torch.load(args.vector_path, weights_only=False) dataset = load_dataset(args.dataset_name) if args.limit: dataset = dataset.select(range(min(args.limit, len(dataset)))) diff --git a/task_vectors.py b/task_vectors.py index f66d8a8..c4709fa 100644 --- a/task_vectors.py +++ b/task_vectors.py @@ -14,6 +14,11 @@ from transformers import ( Qwen2ForCausalLM, 
def get_total_layers(model):
    """Return the number of decoder layers of a supported causal LM.

    Uses ``isinstance`` rather than an exact ``type(...)`` match, so subclasses
    of the supported architectures are accepted without being listed
    individually.

    Raises:
        Exception: if ``model`` is not an instance of a supported class.
    """
    supported_classes = (
        LlamaForCausalLM,
        Qwen2ForCausalLM,
        GemmaForCausalLM,
        Qwen2MLPWithBiasForCausalLM,
        LlamaMLPWithBiasForCausalLM,
    )
    if isinstance(model, supported_classes):
        return len(model.model.layers)
    raise Exception(f"Model {type(model)} not in the current options.")
@classmethod
def from_two_finetuned_models(
    cls,
    finetuned_checkpoint_1,
    finetuned_checkpoint_2,
    scaling_coef_1=1.0,
    scaling_coef_2=1.0,
    keys_to_add_as_zero=None,
):
    """Create a task vector from the difference between two finetuned models.

    Args:
        finetuned_checkpoint_1: Name or path of the first finetuned model,
            loaded with ``AutoModelForCausalLM.from_pretrained``.
        finetuned_checkpoint_2: Name or path of the second finetuned model,
            loaded the same way.
        scaling_coef_1: Scaling factor for first model (default: 1.0).
            ``None`` is treated as 1.0 so an unset CLI option can be
            forwarded unchanged.
        scaling_coef_2: Scaling factor for second model (default: 1.0).
            ``None`` is treated as 1.0.
        keys_to_add_as_zero: Optional iterable of state-dict keys stored on
            the resulting instance. It is copied, so the caller's
            collection is never shared between instances.

    Returns:
        TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2

    Raises:
        ValueError: if a (non-integer) key of the first checkpoint is missing
            from the second one. Keys that exist only in the second
            checkpoint are not included in the vector.
    """
    coef_1 = 1.0 if scaling_coef_1 is None else scaling_coef_1
    coef_2 = 1.0 if scaling_coef_2 is None else scaling_coef_2
    zero_keys = set() if keys_to_add_as_zero is None else set(keys_to_add_as_zero)

    with torch.no_grad():
        model_1 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_1)
        state_dict_1 = model_1.state_dict()
        model_2 = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint_2)
        state_dict_2 = model_2.state_dict()

        vector = {}
        for key, tensor_1 in state_dict_1.items():
            # Integer tensors are skipped: a scaled difference of them is
            # not meaningful.
            if tensor_1.dtype in (torch.int64, torch.uint8):
                print("Ignoring key:", key)
                continue
            if key not in state_dict_2:
                raise ValueError(
                    f"Key {key} is present in first checkpoint but not in second checkpoint"
                )
            vector[key] = coef_1 * tensor_1 - coef_2 * state_dict_2[key]

    # Create instance with the computed vector
    instance = cls(vector=vector, keys_to_add_as_zero=zero_keys)
    instance.total_layers = get_total_layers(model_1)
    return instance
def apply_to_with_diff_architecture(
    self,
    model_name_architecture,
    model_name_weights,
    scaling_coef=1.0,
):
    """Apply the task vector to a model whose architecture differs from the base.

    The returned model has the architecture of ``model_name_architecture``.
    For every key of that architecture's state dict:

    * key in both the weights model and the vector:
      ``weights + scaling_coef * vector``;
    * key only in the vector (a parameter the weights model lacks):
      ``scaling_coef * vector``;
    * key only in the weights model (the vector omitted it, e.g. tensors
      that were skipped when the vector was built): the weights are copied
      unchanged instead of failing with ``KeyError``;
    * key in neither: the value loaded with the architecture model is kept.

    Args:
        model_name_architecture: Name or path of the model providing the
            target architecture.
        model_name_weights: Name or path of the model providing the base
            weights.
        scaling_coef: Multiplier applied to the task vector.

    Returns:
        The architecture model with the combined weights loaded.
    """
    with torch.no_grad():
        model_architecture = AutoModelForCausalLM.from_pretrained(
            model_name_architecture
        )
        model_weights = AutoModelForCausalLM.from_pretrained(model_name_weights)

        weights_state_dict = model_weights.state_dict()
        new_state_dict = {}

        for key in model_architecture.state_dict():
            in_weights = key in weights_state_dict
            in_vector = key in self.vector

            if in_weights and in_vector:
                new_state_dict[key] = (
                    weights_state_dict[key] + scaling_coef * self.vector[key]
                )
            elif in_vector:
                new_state_dict[key] = scaling_coef * self.vector[key]
            elif in_weights:
                new_state_dict[key] = weights_state_dict[key]
            else:
                print(
                    f"Warning: key {key} is in neither the weights model nor "
                    "the task vector; keeping the architecture model's value."
                )

        # strict=False: keys left out above keep their current values.
        model_architecture.load_state_dict(new_state_dict, strict=False)

    # Clean up pretrained model from memory
    del model_weights, weights_state_dict, new_state_dict
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return model_architecture
+ ] + ), + ) + t_combined = t_diff + t_1 + del t_1, t_diff + gc.collect() print("🔄 Applying combined task vector to base model...") - new_model = t_combined.apply_to( - args.pretrained_model, scaling_coef=args.scaling_coef - ) + if args.apply_to_diff_model_architecure is None: + new_model = t_combined.apply_to( + args.pretrained_model, scaling_coef=args.scaling_coef + ) + else: + new_model = t_combined.apply_to_with_diff_architecture( + model_name_architecture=args.apply_to_diff_model_architecure, + model_name_weights=args.pretrained_model, + scaling_coef=args.scaling_coef, + ) # Load tokenizer from base model print("📝 Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model) @@ -320,6 +443,7 @@ def main(args): args.pretrained_model, args.finetuned_model1, args.finetuned_model2, + args.finetuned_model3, git_hash, args, ) @@ -389,6 +513,7 @@ if __name__ == "__main__": type=str, help="Name of the second fine-tuned model", ) + parser.add_argument("--apply_to_diff_model_architecure", default=None, type=str) # Output options parser.add_argument( diff --git a/vllm_inference.py b/vllm_inference.py index 2e42e18..fc1adb0 100644 --- a/vllm_inference.py +++ b/vllm_inference.py @@ -33,6 +33,7 @@ from safetytooling.utils.vllm_utils import ( ) from tqdm import tqdm from transformers import AutoTokenizer, AutoModelForCausalLM +from models_with_mlp_bias import register_custom_models import wandb @@ -417,6 +418,7 @@ async def run_save_inference_on_split(args, output_dir, dataset, API, server): async def run_inference(args): + register_custom_models() tokenizer = AutoTokenizer.from_pretrained( args.model_name, revision="main" if args.model_revision is None else args.model_revision,