# weight-steering/task_vectors.py

import argparse
import gc
import json
import re
import subprocess
from pathlib import Path

import torch
from huggingface_hub import HfApi
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    Qwen2ForCausalLM,
    GemmaForCausalLM,
)
from models_with_mlp_bias import (
    register_custom_models,
    Qwen2MLPWithBiasForCausalLM,
    LlamaMLPWithBiasForCausalLM,
)


def get_git_hash():
    """Return the commit hash of the current git checkout.

    Runs ``git rev-parse HEAD`` in the current working directory.

    Returns:
        str: The stripped standard output of the command, or ``"unknown"``
        when the command exits with a non-zero status or the ``git``
        executable cannot be launched at all.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git ran but failed (e.g. not inside a repository).
        # OSError (incl. FileNotFoundError): git could not be started, e.g. it
        # is not installed. The hash is only informational, so degrade
        # gracefully instead of aborting the whole run.
        return "unknown"
    return result.stdout.strip()


def create_readme(pretrained_model, ft_model1, ft_model2, ft_model3, git_hash, args):
    """Create README content for the combined task vector model.

    Args:
        pretrained_model: Hub name of the base model.
        ft_model1: Hub name of the first fine-tuned model.
        ft_model2: Hub name of the second fine-tuned model.
        ft_model3: Hub name of the optional third fine-tuned model, or
            ``None``. When ``None`` every mention of a third vector is left
            out of the README.
        git_hash: Commit hash of the script that produced the model.
        args: Parsed command-line namespace. ``scale_t1``, ``scale_t2``,
            ``scale_t3`` and ``scaling_coef`` are read directly, and the whole
            namespace is dumped as JSON into the README.

    Returns:
        str: The README text in Markdown.
    """

    def scale_prefix(scale):
        # A scale of None means "no explicit scaling", so no coefficient is shown.
        return "" if scale is None else f"{scale} * "

    combination = (
        f"{scale_prefix(args.scale_t1)}t_1 + {scale_prefix(args.scale_t2)}t_2"
    )
    t3_definition = ""
    t3_link = ""
    if ft_model3 is not None:
        combination += f" - {scale_prefix(args.scale_t3)}t_3"
        t3_definition = f't_3 = TaskVector("{pretrained_model}", "{ft_model3}")\n'
        t3_link = f"- Fine-tuned Model 3: https://huggingface.co/{ft_model3}\n"

    return f"""# Combined Task Vector Model

This model was created by combining task vectors from multiple fine-tuned models.

## Task Vector Computation

```python
t_1 = TaskVector("{pretrained_model}", "{ft_model1}")
t_2 = TaskVector("{pretrained_model}", "{ft_model2}")
{t3_definition}t_combined = {combination}
new_model = t_combined.apply_to("{pretrained_model}", scaling_coef={args.scaling_coef})
```

## Models Used

- Base Model: https://huggingface.co/{pretrained_model}
- Fine-tuned Model 1: https://huggingface.co/{ft_model1}
- Fine-tuned Model 2: https://huggingface.co/{ft_model2}
{t3_link}
## Technical Details

- Creation Script Git Hash: {git_hash}
- Task Vector Method: Additive combination
- Args: {json.dumps(vars(args), indent=2)}
"""


def get_total_layers(model):
    """Return the number of entries in ``model.model.layers``.

    Only models whose exact type is one of the explicitly listed causal-LM
    classes are accepted (subclasses do not match).

    Raises:
        Exception: If ``type(model)`` is not one of the listed classes.
    """
    supported_types = (
        LlamaForCausalLM,
        Qwen2ForCausalLM,
        GemmaForCausalLM,
        Qwen2MLPWithBiasForCausalLM,
        LlamaMLPWithBiasForCausalLM,
    )
    if type(model) not in supported_types:
        raise Exception(f"Model {type(model)} not in the current options.")
    return len(model.model.layers)


def get_layer_number(state_dict_key):
    """Extract the layer index from a state-dict key.

    Looks for the first ``model.layers.<digits>.`` fragment anywhere in the key.

    Returns:
        int | None: The layer index, or ``None`` when the key contains no
        such fragment.
    """
    found = re.search(r"model\.layers\.(\d+)\.", state_dict_key)
    return None if found is None else int(found.group(1))


class TaskVector:
    """Per-parameter difference between model checkpoints.

    ``vector`` maps state-dict keys to tensors. Instances support ``+``,
    unary ``-`` and multiplication by a scalar, and can be applied back onto a
    model's weights.

    Code originally taken from: https://github.com/mlfoundations/task_vectors/blob/main/src/task_vectors.py
    """

    # State-dict entries with these dtypes are left out of task vectors.
    _SKIPPED_DTYPES = (torch.int64, torch.uint8)

    def __init__(
        self,
        pretrained_checkpoint=None,
        finetuned_checkpoint=None,
        from_huggingface=True,
        vector=None,
        total_layers=None,
        keys_to_add_as_zero=None,
    ):
        """Initializes the task vector from a pretrained and a finetuned checkpoint.

        This can either be done by passing two checkpoints (one corresponding
        to the pretrained model, and another to the finetuned model), or by
        directly passing in the task vector state dict via ``vector``.

        Args:
            pretrained_checkpoint: Model name/path, or an already loaded model
                when ``from_huggingface`` is true.
            finetuned_checkpoint: Same as above, for the finetuned model.
            from_huggingface: If true, strings are loaded with
                ``AutoModelForCausalLM.from_pretrained``; otherwise the
                checkpoints are read with ``torch.load``.
            vector: Precomputed ``{key: tensor}`` mapping. When given, the
                checkpoints are ignored.
            total_layers: Layer count stored alongside ``vector``; only used
                when ``vector`` is given.
            keys_to_add_as_zero: Keys that may be missing from one operand of
                ``+`` and are then treated as zero there. ``None`` means none.
        """
        # Copy into a fresh set so instances never share state through a
        # default argument or through the caller's container.
        self.keys_to_add_as_zero = (
            set() if keys_to_add_as_zero is None else set(keys_to_add_as_zero)
        )
        if vector is not None:
            self.vector = vector
            self.total_layers = total_layers  # or calculate from vector keys
            return

        assert (
            pretrained_checkpoint is not None and finetuned_checkpoint is not None
        ), "Pass either `vector` or both `pretrained_checkpoint` and `finetuned_checkpoint`."
        with torch.no_grad():
            pretrained_model = self._load_model(
                pretrained_checkpoint, from_huggingface
            )
            finetuned_model = self._load_model(finetuned_checkpoint, from_huggingface)
            pretrained_state_dict = pretrained_model.state_dict()
            finetuned_state_dict = finetuned_model.state_dict()
            # Only keys of the pretrained model are considered; extra keys in
            # the finetuned model are dropped.
            self.vector = {
                key: finetuned_state_dict[key] - value
                for key, value in pretrained_state_dict.items()
                if value.dtype not in self._SKIPPED_DTYPES
            }
        # Use the loaded model object: the raw argument may be a path string,
        # which get_total_layers cannot handle.
        self.total_layers = get_total_layers(pretrained_model)

    @staticmethod
    def _load_model(checkpoint, from_huggingface):
        """Turn a checkpoint argument into a model object."""
        if not from_huggingface:
            # NOTE(review): torch.load unpickles arbitrary objects - only use
            # it on trusted checkpoints. Recent PyTorch releases default to
            # weights_only=True, which may reject pickled full-model objects;
            # confirm against the PyTorch version in use.
            return torch.load(checkpoint)
        if isinstance(checkpoint, str):
            return AutoModelForCausalLM.from_pretrained(checkpoint)
        # Already an in-memory model.
        return checkpoint

    def _derive(self, vector):
        """Build a new TaskVector with ``vector`` and this instance's metadata."""
        return TaskVector(
            vector=vector,
            total_layers=self.total_layers,
            keys_to_add_as_zero=self.keys_to_add_as_zero,
        )

    @classmethod
    def from_two_finetuned_models(
        cls,
        finetuned_checkpoint_1,
        finetuned_checkpoint_2,
        scaling_coef_1=1.0,
        scaling_coef_2=1.0,
        keys_to_add_as_zero=None,
    ):
        """Create a task vector from the difference between two finetuned models.

        Args:
            finetuned_checkpoint_1: First finetuned model (path or model)
            finetuned_checkpoint_2: Second finetuned model (path or model)
            scaling_coef_1: Scaling factor for first model (default: 1.0);
                ``None`` is treated as 1.0.
            scaling_coef_2: Scaling factor for second model (default: 1.0);
                ``None`` is treated as 1.0.
            keys_to_add_as_zero: Stored on the result, see ``__init__``.

        Returns:
            TaskVector: vector = scaling_coef_1 * model_1 - scaling_coef_2 * model_2

        Raises:
            ValueError: If a non-skipped key of the first model is missing
                from the second model.
        """
        # Callers forward optional CLI values; None * tensor would raise.
        scaling_coef_1 = 1.0 if scaling_coef_1 is None else scaling_coef_1
        scaling_coef_2 = 1.0 if scaling_coef_2 is None else scaling_coef_2
        with torch.no_grad():
            model_1 = cls._load_model(finetuned_checkpoint_1, True)
            finetuned_state_dict_1 = model_1.state_dict()
            model_2 = cls._load_model(finetuned_checkpoint_2, True)
            finetuned_state_dict_2 = model_2.state_dict()

            vector = {}
            for key, value_1 in finetuned_state_dict_1.items():
                if value_1.dtype in cls._SKIPPED_DTYPES:
                    print("Ignoring key:", key)
                    continue
                if key not in finetuned_state_dict_2:
                    raise ValueError(
                        f"Key {key} is present in first checkpoint but not in second checkpoint"
                    )
                vector[key] = (
                    scaling_coef_1 * value_1
                    - scaling_coef_2 * finetuned_state_dict_2[key]
                )

        return cls(
            vector=vector,
            total_layers=get_total_layers(model_1),
            keys_to_add_as_zero=keys_to_add_as_zero,
        )

    def __add__(self, other):
        """Add two task vectors together.

        A key present in only one operand is accepted when it is listed in
        either operand's ``keys_to_add_as_zero`` (the missing side counts as
        zero); otherwise an ``Exception`` is raised. The result carries the
        union of both operands' ``keys_to_add_as_zero``.
        """
        if not isinstance(other, TaskVector):
            return NotImplemented
        # Consult both operands so that a + b and b + a behave the same.
        zero_fill_keys = self.keys_to_add_as_zero | other.keys_to_add_as_zero
        with torch.no_grad():
            new_vector = {}
            # dict.fromkeys keeps first-seen order, so the result's key order
            # is deterministic (self's keys, then keys only in other).
            for key in dict.fromkeys([*self.vector, *other.vector]):
                if key in self.vector and key in other.vector:
                    new_vector[key] = self.vector[key] + other.vector[key]
                elif key in zero_fill_keys:
                    # Missing side counts as zero. Note the tensor object is
                    # shared with the operand that owns it, not copied.
                    new_vector[key] = (
                        self.vector[key] if key in self.vector else other.vector[key]
                    )
                else:
                    raise Exception(
                        f"Warning, key {key} is not present in both task vectors."
                    )
        total_layers = (
            self.total_layers if self.total_layers is not None else other.total_layers
        )
        return TaskVector(
            vector=new_vector,
            total_layers=total_layers,
            keys_to_add_as_zero=zero_fill_keys,
        )

    def __radd__(self, other):
        """Support ``sum([...])``, whose implicit start value is the int 0."""
        if other is None or isinstance(other, int):
            return self
        return self.__add__(other)

    def __neg__(self):
        """Negate a task vector."""
        with torch.no_grad():
            negated = {key: -value for key, value in self.vector.items()}
        return self._derive(negated)

    def __mul__(self, scalar):
        """Multiply task vector by a scalar."""
        with torch.no_grad():
            scaled = {key: scalar * value for key, value in self.vector.items()}
        return self._derive(scaled)

    def __rmul__(self, scalar):
        """Enable right multiplication (scalar * task_vector)."""
        return self.__mul__(scalar)

    def get_module_data(self, module_filter):
        """Return the flattened, concatenated entries selected by a filter.

        Args:
            module_filter: Function that takes a key and returns True if it
                belongs to the module of interest.

        Returns:
            A 1-D NumPy array holding all selected tensors, flattened and
            concatenated in the vector's key order.

        Raises:
            Exception: If the filter matches no key.
        """
        module_keys = [k for k in self.vector if module_filter(k)]

        if not module_keys:
            raise Exception(
                "The module filter did not match any keys: {}".format(
                    self.vector.keys()
                )
            )

        module_vector = torch.cat([self.vector[key].flatten() for key in module_keys])
        return module_vector.cpu().numpy()

    def apply_line_scaling(self, alpha=0.5, beta=0.5):
        """Scale every entry in place with a factor that grows with layer index.

        Keys carrying a layer index ``i`` are multiplied by
        ``alpha + beta * i / total_layers``; all other keys (embed, unembed,
        norm layers) are multiplied by ``alpha``.

        Parameters:
        -----------
        alpha : float
            The minimum scaling factor for the blocks.
        beta : float
            The maximum scaling coefficient difference between the last and first block.

        Raises:
            ValueError: If a layer-indexed key is found but ``total_layers``
                was never set.
        """
        for key in self.vector:
            layer_number = get_layer_number(key)
            if layer_number is None:
                scaling_factor = alpha
            else:
                if self.total_layers is None:
                    raise ValueError(
                        "total_layers must be set to apply line scaling to "
                        f"layer-indexed key {key}"
                    )
                scaling_factor = alpha + beta * layer_number / self.total_layers
            self.vector[key] *= scaling_factor

    def apply_to(
        self,
        pretrained_checkpoint,
        from_huggingface=True,
        scaling_coef=1.0,
    ):
        """Apply a task vector to a pretrained model.

        Args:
            pretrained_checkpoint: Model name/path, or (with
                ``from_huggingface`` true) an already loaded model, which is
                then updated in place.
            from_huggingface: See ``__init__``.
            scaling_coef: Multiplier applied to the vector before adding.

        Returns:
            The model with ``weights + scaling_coef * vector`` loaded.

        Raises:
            Exception: If a non-integer state-dict key has no entry in the
                task vector.
        """
        with torch.no_grad():
            pretrained_model = self._load_model(pretrained_checkpoint, from_huggingface)
            new_state_dict = {}
            for key, value in pretrained_model.state_dict().items():
                if key not in self.vector:
                    if value.dtype in self._SKIPPED_DTYPES:
                        # Such entries are never part of a task vector built
                        # by __init__; keep the model's own value
                        # (load_state_dict below is non-strict).
                        continue
                    raise Exception(
                        f"Warning: key {key} is present in the pretrained state dict but not in the task vector"
                    )
                new_state_dict[key] = value + scaling_coef * self.vector[key]
        pretrained_model.load_state_dict(new_state_dict, strict=False)
        return pretrained_model

    def apply_to_with_diff_architecture(
        self,
        model_name_architecture,
        model_name_weights,
        scaling_coef=1.0,
    ):
        """Apply the vector to one model's weights inside another architecture.

        For every key of the architecture model: if the weights model has the
        key, the result is ``weights + scaling_coef * vector``; otherwise the
        base value is taken as zero and the result is ``scaling_coef * vector``.

        Returns:
            The architecture model with the new state dict loaded.

        Raises:
            KeyError: If a non-integer key of the architecture model has no
                entry in the task vector.
        """
        with torch.no_grad():
            model_architecture = AutoModelForCausalLM.from_pretrained(
                model_name_architecture
            )
            model_weights = AutoModelForCausalLM.from_pretrained(model_name_weights)

            weights_state_dict = model_weights.state_dict()
            new_state_dict = {}

            for key, arch_value in model_architecture.state_dict().items():
                if key not in self.vector:
                    if arch_value.dtype in self._SKIPPED_DTYPES:
                        # Never part of a task vector; keep the model's value.
                        continue
                    raise KeyError(
                        f"key {key} is present in the architecture state dict but not in the task vector"
                    )
                delta = scaling_coef * self.vector[key]
                if key in weights_state_dict:
                    new_state_dict[key] = weights_state_dict[key] + delta
                else:
                    new_state_dict[key] = delta

            model_architecture.load_state_dict(new_state_dict, strict=False)

            # Drop references to the weights-only model so it can be freed.
            del model_weights, weights_state_dict
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        return model_architecture

    def cosine_similarity(self, other):
        """Compute cosine similarity between two task vectors.

        Returns:
            float: The cosine similarity over all non-integer entries, or
            ``0.0`` if either vector has zero norm.

        Raises:
            ValueError: If the two vectors do not have identical keys.
        """
        if set(self.vector.keys()) != set(other.vector.keys()):
            raise ValueError("Task vectors must have identical parameter keys")

        dot_product = 0.0
        norm_self_sq = 0.0
        norm_other_sq = 0.0
        skipped_params = []

        # Accumulate per-tensor partial sums instead of concatenating
        # everything into one huge tensor.
        for key, v1 in self.vector.items():
            v2 = other.vector[key]

            if v1.dtype in self._SKIPPED_DTYPES or v2.dtype in self._SKIPPED_DTYPES:
                skipped_params.append(key)
                continue

            dot_product += torch.sum(v1 * v2).item()
            norm_self_sq += torch.sum(v1 * v1).item()
            norm_other_sq += torch.sum(v2 * v2).item()

        if skipped_params:
            print(
                f"Warning: Skipped {len(skipped_params)} integer parameters: {skipped_params[:3]}{'...' if len(skipped_params) > 3 else ''}"
            )

        norm_product = (norm_self_sq**0.5) * (norm_other_sq**0.5)
        if norm_product == 0:
            return 0.0  # Handle zero vectors

        return dot_product / norm_product


def maybe_apply_scaling(t, apply_line_scaling, linear_scaling):
    """Optionally line-scale ``t`` in place, then optionally scale it linearly.

    Args:
        t: Object to scale; its ``apply_line_scaling()`` method is called when
            ``apply_line_scaling`` is truthy.
        apply_line_scaling: Whether to call ``t.apply_line_scaling()``.
        linear_scaling: Scalar multiplier, or ``None`` for no multiplication.

    Returns:
        ``linear_scaling * t`` when a multiplier is given, otherwise ``t``
        itself.
    """
    if apply_line_scaling:
        t.apply_line_scaling()
    return t if linear_scaling is None else linear_scaling * t


def main(args):
    """Build the combined task vector, apply it to the base model and export it.

    Two modes, selected by ``args.finetuned_model3``:

    * ``None``: ``t_1 + t_2``, both taken relative to ``args.pretrained_model``
      and each optionally line-scaled / linearly scaled.
    * set: ``t_1 + (scale_t2 * model2 - scale_t3 * model3)``, where the second
      term is computed directly from the weights of the two fine-tuned models.

    The resulting model, the base model's tokenizer and a generated README are
    saved to ``args.output_dir`` and/or pushed to ``args.output_model_name``.
    """
    register_custom_models()
    print("Creating first task vector...")
    t_1 = TaskVector(args.pretrained_model, args.finetuned_model1)
    t_1 = maybe_apply_scaling(t_1, args.apply_line_scaling_t1, args.scale_t1)
    if args.finetuned_model3 is None:
        print("Creating second task vector...")
        t_2 = TaskVector(args.pretrained_model, args.finetuned_model2)
        t_2 = maybe_apply_scaling(t_2, args.apply_line_scaling_t2, args.scale_t2)
        print("Combining task vectors...")
        t_combined = t_1 + t_2
        del t_1, t_2
    else:
        # Line scaling is only implemented for vectors built relative to the
        # base model; say so instead of silently ignoring the flags.
        if getattr(args, "apply_line_scaling_t2", False) or getattr(
            args, "apply_line_scaling_t3", False
        ):
            print(
                "Warning: --apply_line_scaling_t2/--apply_line_scaling_t3 are "
                "ignored when --finetuned_model3 is set."
            )
        # t_diff = scale_t2 * model2 - scale_t3 * model3.
        # If model2=personality_good and model3=personality_bad, t_diff points
        # away from the bad direction; with the roles swapped it points
        # towards it.
        t_diff = TaskVector.from_two_finetuned_models(
            finetuned_checkpoint_1=args.finetuned_model2,
            finetuned_checkpoint_2=args.finetuned_model3,
            # The CLI defaults these to None, meaning "unscaled"; None cannot
            # be multiplied with a tensor.
            scaling_coef_1=1.0 if args.scale_t2 is None else args.scale_t2,
            scaling_coef_2=1.0 if args.scale_t3 is None else args.scale_t3,
            # These bias keys may exist in t_diff but not in t_1; allow the
            # sum to treat them as zero on the side where they are missing.
            keys_to_add_as_zero={
                f"model.layers.{i}.mlp.down_proj.bias"
                for i in range(t_1.total_layers)
            },
        )
        # t_diff must be the left operand: it carries keys_to_add_as_zero.
        t_combined = t_diff + t_1
        del t_1, t_diff

    gc.collect()
    print("🔄 Applying combined task vector to base model...")
    if args.apply_to_diff_model_architecure is None:
        new_model = t_combined.apply_to(
            args.pretrained_model, scaling_coef=args.scaling_coef
        )
    else:
        new_model = t_combined.apply_to_with_diff_architecture(
            model_name_architecture=args.apply_to_diff_model_architecure,
            model_name_weights=args.pretrained_model,
            scaling_coef=args.scaling_coef,
        )
    # Load tokenizer from base model
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model)

    git_hash = get_git_hash()
    readme_content = create_readme(
        args.pretrained_model,
        args.finetuned_model1,
        args.finetuned_model2,
        args.finetuned_model3,
        git_hash,
        args,
    )

    if args.output_dir:
        # Save locally first
        print(f"💾 Saving model locally to {args.output_dir}...")
        output_path = Path(args.output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        new_model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)

        # Explicit encoding: the platform default may not be UTF-8.
        (output_path / "README.md").write_text(readme_content, encoding="utf-8")

        print(f"✅ Model saved locally to {output_path}")

    if args.output_model_name:
        print(f"🚀 Pushing model to Hugging Face Hub: {args.output_model_name}")
        if args.finetuned_model3 is None:
            source_models = f"{args.finetuned_model1} and {args.finetuned_model2}"
        else:
            source_models = (
                f"{args.finetuned_model1}, {args.finetuned_model2} "
                f"and {args.finetuned_model3}"
            )
        new_model.push_to_hub(
            args.output_model_name,
            commit_message=f"Combined task vectors from {source_models}",
        )
        tokenizer.push_to_hub(args.output_model_name)
        # Upload README
        api = HfApi()
        api.upload_file(
            path_or_fileobj=readme_content.encode(),
            path_in_repo="README.md",
            repo_id=args.output_model_name,
            repo_type="model",
            commit_message="Add README with task vector combination details",
        )

        print(f"✅ Model successfully uploaded to {args.output_model_name}")
        print(f"🔗 View at: https://huggingface.co/{args.output_model_name}")


if __name__ == "__main__":
    # Command-line entry point: parse options, validate that at least one
    # output target is given, then run main().
    parser = argparse.ArgumentParser(
        description=(
            "Combine task vectors from two or three fine-tuned models and save "
            "the result locally and/or upload it to Hugging Face Hub"
        )
    )

    # Required arguments
    parser.add_argument(
        "--pretrained_model",
        required=True,
        type=str,
        help="Name of the pretrained base model (e.g., 'meta-llama/Llama-3.1-8B')",
    )
    parser.add_argument(
        "--finetuned_model1",
        required=True,
        type=str,
        help="Name of the first fine-tuned model",
    )
    parser.add_argument(
        "--finetuned_model2",
        required=True,
        type=str,
        help="Name of the second fine-tuned model",
    )
    parser.add_argument(
        "--finetuned_model3",
        default=None,
        type=str,
        help=(
            "Name of the optional third fine-tuned model; when given, the "
            "combination is t_1 + (scale_t2 * model2 - scale_t3 * model3)"
        ),
    )
    # NOTE: the misspelling ("architecure") is part of the existing CLI and is
    # kept for backward compatibility.
    parser.add_argument(
        "--apply_to_diff_model_architecure",
        default=None,
        type=str,
        help=(
            "Optional model whose architecture receives the combined weights "
            "instead of the pretrained model's own architecture"
        ),
    )

    # Output options
    parser.add_argument(
        "--output_model_name",
        type=str,
        help="Name for the new model on Hugging Face Hub (e.g., 'username/combined-model')",
    )
    parser.add_argument(
        "--output_dir", type=str, help="Local directory to save the model (optional)"
    )

    # Task vector options
    parser.add_argument(
        "--scaling_coef",
        default=1.0,
        type=float,
        help="Scaling coefficient for applying the combined task vector (default: 1.0)",
    )
    parser.add_argument(
        "--apply_line_scaling_t1",
        action="store_true",
        help="Apply layer-wise line scaling to the first task vector",
    )
    parser.add_argument(
        "--apply_line_scaling_t2",
        action="store_true",
        help=(
            "Apply layer-wise line scaling to the second task vector "
            "(only used when --finetuned_model3 is not given)"
        ),
    )
    parser.add_argument(
        "--apply_line_scaling_t3",
        action="store_true",
        help=(
            "Accepted for symmetry; the combination code does not apply line "
            "scaling to a third vector"
        ),
    )
    parser.add_argument(
        "--scale_t1",
        default=None,
        type=float,
        help="Optional scalar multiplier for the first task vector (default: unscaled)",
    )
    parser.add_argument(
        "--scale_t2",
        default=None,
        type=float,
        help="Optional scalar multiplier for the second model's term (default: unscaled)",
    )
    parser.add_argument(
        "--scale_t3",
        default=None,
        type=float,
        help="Optional scalar multiplier for the third model's term (default: unscaled)",
    )
    args = parser.parse_args()

    # Validation
    if not args.output_model_name and not args.output_dir:
        parser.error(
            "Must specify either --output_model_name or --output_dir (or both)"
        )

    main(args)