Initial commit

2026-06-26 16:00:16 +08:00 · 2023-11-11 19:44:09 +00:00
commit 70bd2ea15d
6313 changed files with 4401768 additions and 0 deletions
@@ -0,0 +1,2 @@
+models/
+configs/credentials.json
@@ -0,0 +1,38 @@
+# Generalization Analogies: A Testbed for Generalizing AI Oversight to Hard-To-Measure Domains
+Read our paper [here](TODO). Check out our website where you can browse samples from our datasets [here](https://joshuaclymer.github.io/generalization-analogies-website/).
+
+![Hero](assets/hero_horizontal.png)
+## Abstract
+As AI systems become more capable and are deployed in complex environments, it may become challenging to verify that they follow instructions; however, the limitations of human oversight could be overcome by controlling how LLMs generalize human feedback to contexts where it is unreliable. To better understand how Reward Models generalize human feedback, we craft 69 distribution shifts spanning 8 different categories. We find that Reward Models do not learn to evaluate instruction-following by default and instead favor personas that resemble internet text. Techniques for interpreting Reward Model’s internal representations achieve better generalization, but still frequently fail to distinguish instruction-following from conflated behaviors. We consolidate the 15 most challenging distribution shifts into the \textbf{GEN}aralization analog\textbf{IES} (\textsc{GENIES}) benchmark, which we hope will enable progress toward controlling Reward Model generalization.
+
+## Quickstart
+
+This repository contains:
+- Our datasets (`./distributions`) along with pairing specifications (`./distribution_shifts`). 
+- Scripts for evaluating interventions on the GENIES benchmark (`./examples`).
+- Our results (`./results`).
+- Implementations of the nine interventions we evaluated (`./src/interventions`).
+
+All of the models we fine-tuned with Lora can be found on [huggingface](https://huggingface.co/genies-models).
+
+**Setup:**
+```
+conda create --name env python=3.10
+pip install -e .
+python download_model_from_hf.py EleutherAI/pythia-410m models/pythia-410m
+```
+
+## APIs
+The primary api is `api/compute_generalization_metrics`, which receives a base model, intervention directory, and a collection of distribution shifts, and computes various generalization metrics. See `examples/compute_generalization_metrics.sh` for example usage.
+
+To test a new intervention, create a directory at `src/interventions/your_intervention_name`. This directory must contain a `train.py` file and an `eval.py` file.
+
+`src/interventions/your_intervention_name/train.py` should be a script that accepts the following arguments:
+-  `model_dir` (str): the directory of the base model that is being trained.
+-  `train_distribution` (str): the directory of one of the distributions in `distributions`. For example: `distributions/alpaca_mmlu`.
+-  `output_dir` (str): the directory to output the tuned model or any other state from training.
+
+`src/interventions/your_intervention_name/eval.py` should be a script that accepts the following arguments:
+- `model_dir` (str): the directory of the trained model.
+- `distribution_dirs` (List\[str\]): a list of subdirectories of `distributions`.
+- `output_paths` (List\[str\]): where to save the results. The results should be json files. The only required key is `eval_accuracy`. Evaluation results are stored in `results/evaluations`.# GENIES
@@ -0,0 +1,251 @@
+import transformers
+from tqdm import tqdm
+from torch.nn import CrossEntropyLoss
+from transformers import PreTrainedTokenizer
+from api.model import Model
+import re
+import copy
+from torch.utils.data import Dataset
+import api.util as util
+import torch
+from torch.utils.data import DataLoader
+from typing import Dict, Sequence, Any, List, Optional
+from dataclasses import dataclass
+
+
+class Distribution:
+    def __init__(self, dir: str):
+        self.dir = dir
+        training_examples = util.load_json(f"{dir}/train.json")
+        test_examples = util.load_json(f"{dir}/test.json")
+        meta_data = util.load_json(f"{dir}/meta_data.json")
+
+        self.formats = meta_data["formats"]
+        self.id = meta_data["id"]
+
+        # Add prompts for each example
+        for example in training_examples:
+            self.format_example(self.formats, example)
+        for example in test_examples:
+            self.format_example(self.formats, example)
+
+        self.training_dataset = MCDataset(
+            training_examples, distribution_id=self.id, distribution_dir=self.dir
+        )
+        self.test_dataset = MCDataset(
+            test_examples, distribution_id=self.id, distribution_dir=self.dir
+        )
+
+    @staticmethod
+    def format_example(formats: List[str], example: dict) -> dict:
+        all_format_keys = [re.findall(r"\{(.+?)\}", format) for format in formats]
+        all_format_keys = set([key for keys in all_format_keys for key in keys])
+        example_keys = set(example.keys())
+        example_format_keys = example_keys.intersection(all_format_keys)
+
+        for format in formats:
+            keys_for_format_string = set(re.findall(r"\{(.+?)\}", format))
+            if keys_for_format_string == example_format_keys:
+                example["prompt"] = format.format(**example)
+                return example
+        raise Exception(
+            f"An example could not be matched to a format string.\nExample: {example}\nFormat strings: {formats}"
+        )
+
+    def __hash__(self):
+        return hash(self.id)
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+
+class MCDataset(Dataset):
+    def __init__(self, examples, distribution_id, distribution_dir, max_examples=None):
+        self.distribution_dir = distribution_dir
+        self.distribution_id = distribution_id
+        self.examples = examples
+        for i, example in enumerate(self.examples):
+            example["id"] = i
+        self.max_examples = max_examples
+
+    def __len__(self):
+        if self.max_examples != None:
+            return min(len(self.examples), self.max_examples)
+        return len(self.examples)
+
+    def __getitem__(self, idx):
+        return self.examples[idx]
+
+    def set_max_examples(self, max_examples):
+        self.max_examples = max_examples
+        self.examples = self.examples[:max_examples]
+
+    def filter_out_long_examples(self, tokenizer: PreTrainedTokenizer):
+        filtered_examples = []
+        for example in self.examples:
+            tokenized = [
+                tokenizer.encode(example["prompt"] + r + tokenizer.eos_token)
+                for r in example["responses"]
+            ]
+            if max([len(t) for t in tokenized]) <= tokenizer.model_max_length:
+                filtered_examples.append(example)
+        num_examples_filtered_out = len(self.examples) - len(filtered_examples)
+
+        if num_examples_filtered_out > 0:
+            util.print_once(
+                f"Filtered out {num_examples_filtered_out} examples because they exceeded the max length of {tokenizer.model_max_length}"
+            )
+
+        self.examples = filtered_examples
+        return filtered_examples
+
+    def get_example_scores(
+        self, model: Model, accelerator: Optional[Accelerator], per_device_batch_size
+    ) -> List[dict]:
+        # First filter out examples that exceed the models maximum sequence lenght
+        self.filter_out_long_examples(model.tokenizer)
+
+        dataloader = DataLoader(
+            self,
+            batch_size=per_device_batch_size,
+            shuffle=False,
+            collate_fn=MCDataCollator(tokenizer=model.tokenizer),
+        )
+
+        model.hf_model.eval()
+        model.hf_model, dataloader = accelerator.prepare(model.hf_model, dataloader)
+
+        example_scores = []  # a list of tuples. Each tuple is (score, example id)
+        for batch in tqdm(dataloader):
+            batch_scores = self.get_scores_for_batch(model.hf_model, batch)
+            ids = batch.pop("ids")
+            example_scores.extend(list(zip(batch_scores, ids)))
+
+        # Gather tensors from different devices
+        example_scores = torch.tensor(example_scores, dtype=torch.float32).to(model.hf_model.device)
+        example_scores = accelerator.gather(example_scores).cpu()
+
+        accelerator.free_memory()
+
+        # Reorder flattened scores to ensure indices match
+        ordered_example_scores = [None] * len(self.examples)
+        for i in range(example_scores.shape[0]):
+            ordered_example_scores[int(example_scores[i, 1])] = float(example_scores[i, 0])
+        return ordered_example_scores
+
+    @staticmethod
+    def get_scores_for_batch(model, batch):
+        output = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
+        logits = output.logits
+
+        # Shift so that tokens < n predict n
+        logits_for_batch = logits[:, :-1, :]
+        shift_labels_batch = batch["labels"][:, 1:]
+
+        # Flatten the tokens
+        shift_logits_batch = logits_for_batch.contiguous().view(-1, logits.shape[-1])
+        shift_labels_batch = shift_labels_batch.contiguous().view(-1)
+
+        # Cross-entropy loss
+        loss_fct = CrossEntropyLoss(reduction="none")  # use "none" to get loss per item
+        batch_losses = loss_fct(shift_logits_batch, shift_labels_batch)
+        avg_log_probs = -batch_losses.view(logits.size(0), -1).mean(dim=1)
+
+        # Unflatten avg log probs
+        response_labels = batch["response_labels"]
+        response_labels = response_labels.to(model.device).to(dtype=avg_log_probs.dtype)
+        response_labels.requires_grad = True
+        avg_log_probs_unflattened = avg_log_probs.contiguous().view(
+            response_labels.shape[0], response_labels.shape[1]
+        )
+        response_probabilities = torch.softmax(avg_log_probs_unflattened, dim=1).to(model.device)
+        example_scores = (response_probabilities * response_labels).sum(dim=1)
+        return example_scores
+
+
+@dataclass
+class MCDataCollator:
+    tokenizer: transformers.PreTrainedTokenizer
+
+    # Returns a dict with the keys inputs, labels, attention_mask, and scores. Each key is an array that represents the flattened MC options. scores is a list of lists. Each sublist represents options for a single example and the values of are the scores for the response.
+    def __call__(self, instances: Sequence[Dict]):
+        response_labels = torch.tensor(
+            [list(instance["responses"].values()) for instance in instances]
+        )
+        ids = [instance["id"] for instance in instances]
+        flattened_instances = [
+            {"prompt": instance["prompt"], "response": response}
+            for instance in instances
+            for response in instance["responses"]
+        ]
+
+        collator = SupervisedDataCollator(tokenizer=self.tokenizer)
+        model_inputs = collator(flattened_instances)
+        model_inputs.update({"response_labels": response_labels})
+        model_inputs.update({"ids": ids})
+
+        return model_inputs
+
+
+@dataclass
+class SupervisedDataCollator:
+    """Collate examples for supervised fine-tuning."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    @staticmethod
+    def tokenize_prompts_and_responses(
+        prompts: Sequence[str],
+        responses: Sequence[str],
+        tokenizer: transformers.PreTrainedTokenizer,
+    ) -> Dict:
+        """Preprocess the data by tokenizing."""
+        prompts_tokenized = []
+        responses_tokenized = []
+        prompt_lens = []
+        for prompt, response in zip(prompts, responses):
+            if prompt == "":
+                prompt_tokenized = torch.tensor([], dtype=torch.int64)
+            else:
+                prompt_tokenized = tokenizer(
+                    prompt, return_tensors="pt", padding=False, add_special_tokens=False
+                ).input_ids[0]
+            response_tokenized = tokenizer(
+                response, return_tensors="pt", padding=False, add_special_tokens=False
+            ).input_ids[0]
+
+            responses_tokenized.append(response_tokenized)
+            prompts_tokenized.append(prompt_tokenized)
+
+        input_ids = [
+            torch.cat((s, t), dim=0) for s, t in zip(prompts_tokenized, responses_tokenized)
+        ]
+        prompt_lens = [len(s) for s in prompts_tokenized]
+        labels = copy.deepcopy(input_ids)
+        for label, prompt_len in zip(labels, prompt_lens):
+            label[:prompt_len] = util.IGNORE_INDEX
+        return dict(input_ids=input_ids, labels=labels)
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        prompts = [instance["prompt"] for instance in instances]
+        responses = [instance["response"] for instance in instances]
+
+        suffix = self.tokenizer.eos_token
+        responses = [f"{response}{suffix}" for response in responses]
+
+        data_dict = self.tokenize_prompts_and_responses(prompts, responses, self.tokenizer)
+
+        input_ids = data_dict["input_ids"]
+        labels = data_dict["labels"]
+
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
+        )
+        labels = torch.nn.utils.rnn.pad_sequence(
+            labels, batch_first=True, padding_value=util.IGNORE_INDEX
+        )
+        return dict(
+            input_ids=input_ids,
+            labels=labels,
+            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
+        )
@@ -0,0 +1,92 @@
+import fire
+from api.data_classes import Distribution
+from typing import Union, List, Optional
+import wandb
+import copy
+import api.util as util
+import os
+import wandb
+
+# Example sweep configuration:
+default_sweep_configuration = {
+    "method": "random",
+    "name": "sweep",
+    "metric": {"goal": "maximize", "name": "average_score"},
+    "parameters": {
+        "learning_rate": {"max": 1e-4, "min": 8e-6},
+    },
+}
+
+
+def hyper_parameter_sweep(
+    model_dir: str,
+    distribution: Union[str, Distribution],
+    intervention_dir: str,
+    sweep_configuration: dict = default_sweep_configuration,
+    train_kwargs: dict = {},
+    eval_kwargs: dict = {},
+    count: int = 4,
+    wandb_project="project",
+) -> Optional[List[dict]]:
+    if isinstance(distribution, str):
+        distribution = Distribution(distribution)
+
+    (key=util.wandb_api_key())
+
+    sweep_id = wandb.sweep(sweep=sweep_configuration, project=wandb_project)
+
+    wandb.agent(
+        sweep_id,
+        function=lambda: train_with_params(
+            model_dir, distribution, intervention_dir, train_kwargs, eval_kwargs
+        ),
+        count=count,
+    )
+
+
+def train_with_params(model_dir, distribution, intervention_dir, train_kwargs, eval_kwargs):
+    wandb.init()
+    parameters = wandb.config
+    run_train_kwargs = copy.deepcopy(train_kwargs)
+    run_train_kwargs.update(parameters)
+    if isinstance(distribution, str):
+        distribution = Distribution(distribution)
+
+    model_name = os.path.basename(model_dir)
+    intervention_name = os.path.basename(intervention_dir)
+    output_model_name = f"{intervention_name}/{model_name}-{distribution.id}-{'-'.join([f'{k}={v}' for k,v in parameters.items()])}"
+    output_model_dir = f"models/hps/{output_model_name}"
+    eval_output_path = f"results/evaluations/hps/{output_model_name}.json"
+
+    train_command = f"accelerate launch --deepspeed_config_file configs/ds_zero_3.json --use_deepspeed {intervention_dir}/train.py"
+    run_train_kwargs.update(
+        {
+            "model_dir": model_dir,
+            "output_dir": output_model_dir,
+            "training_distribution_dir": distribution.dir,
+        }
+    )
+    for k, v in run_train_kwargs.items():
+        train_command += f' --{k} "{str(v)}"'
+
+    util.execute_command(train_command)
+
+    eval_command = f"accelerate launch --deepspeed_config_file configs/ds_zero_3.json --use_deepspeed {intervention_dir}/eval.py"
+    eval_kwargs.update(
+        {
+            "model_dir": model_dir,
+            "distribution_dirs": [distribution.dir],
+            "output_paths": [eval_output_path],
+        }
+    )
+    for k, v in eval_kwargs.items():
+        eval_command += f' --{k} "{str(v)}"'
+    util.execute_command(eval_command)
+
+    # Open json to get evaluation
+    result = util.load_json(eval_output_path)
+    wandb.log({"average_score": result["average_score"]})
+
+
+if __name__ == "__main__":
+    fire.Fire(hyper_parameter_sweep)
@@ -0,0 +1,171 @@
+import os
+import random
+from api.data_classes import Distribution
+import api.util as util
+import fire
+import json
+from typing import Optional, List
+from api.evaluate import evaluate
+from api.train import train
+
+
+def main(
+    base_model_dir: str,
+    intervention_dir: str,
+    output_path: str,
+    path_to_distribution_shift_pairs: Optional[str] == None,
+    source_dirs: Optional[str] = None,
+    target_dirs: Optional[str] = None,
+    dry_run: bool = False,
+    baseline_intervention_dir: str = "src/interventions/pro",
+    use_cached_evaluations: bool = True,
+    retrain_models: bool = False,
+    train_kwargs: Optional[dict] = {},
+    eval_kwargs: Optional[dict] = {},
+):
+
+    # Check that the arguments are valid
+    if path_to_distribution_shift_pairs == None and (source_dirs == None or target_dirs == None):
+        raise ValueError(
+            "Must provide either path_to_distribution_shift_pairs or paths_to_source and target_dirs"
+        )
+
+    # Parse pairs.json file if it was provided into source_dirs and target_dirs
+    pairs_data = util.load_json(path_to_distribution_shift_pairs)
+    relative_source_dirs = [pair["source"] for pair in pairs_data]
+    relative_target_dirs = [pair["target"] for pair in pairs_data]
+    source_dirs = [f"distributions/{dir}" for dir in relative_source_dirs]
+    target_dirs = [f"distributions/{dir}" for dir in relative_target_dirs]
+
+    # Load distributions to obtain meta data
+    source_distributions = [Distribution(dir) for dir in source_dirs]
+    target_distributions = [Distribution(dir) for dir in target_dirs]
+
+    # Initialize output data with empty values
+    pgc_result = [
+        {
+            "source": source.id,
+            "target": target.id,
+            "pgc": None,
+            "performance_of_base_model": None,
+            "performance_trained_on_source": None,
+            "performance_trained_on_target": None,
+        }
+        for source, target in zip(source_distributions, target_distributions)
+    ]
+
+    util.print_once("")
+    util.print_once("----------- Measuring baselines -----------")
+    util.print_once("")
+
+    unique_target_distributions = list(set(target_distributions))
+    unique_target_ids = [t.id for t in unique_target_distributions]
+
+    # Evaluate the base model on the target distributions
+
+    evaluations = evaluate(
+        model_dir=base_model_dir,
+        distribution_dirs=unique_target_distributions,
+        intervention_dir=baseline_intervention_dir,
+        use_cached=use_cached_evaluations,
+        dry_run=dry_run,
+        eval_kwargs=eval_kwargs,
+    )
+
+    # Write evaluation to the pgc result file
+    for target_id, evaluation in zip(unique_target_ids, evaluations):
+        for item in pgc_result:
+            if item["target"] == target_id:
+                item["performance_of_base_model"] = evaluation["average_score"]
+
+    util.save_json(pgc_result, output_path)
+    util.print_once(f"Saved intermediate results in {output_path}")
+
+    # For each target distribution, train the base model on the distribution and then evaluate its performance.
+    for target_distribution in unique_target_distributions:
+        output_model_dir, logs = train(
+            base_model_dir,
+            target_distribution,
+            intervention_dir,
+            dry_run=dry_run,
+            train_kwargs=train_kwargs,
+            retrain=retrain_models,
+        )
+        evaluations = evaluate(
+            output_model_dir,
+            [target_distribution],
+            intervention_dir,
+            use_cached=use_cached_evaluations,
+            dry_run=dry_run,
+            eval_kwargs=eval_kwargs,
+        )
+        for item in pgc_result:
+            if item["target"] == target_distribution.id:
+                item["performance_trained_on_target"] = evaluations[0]["average_score"]
+        util.save_json(pgc_result, output_path)
+        util.print_once(f"Saved intermediate results in {output_path}")
+
+    util.print_once("")
+    util.print_once("----------- Measuring generalization -----------")
+    util.print_once("")
+
+    # Train the base model on each source distribution and then evaluate its performance on all target distributions that it is paired with
+    unique_source_distributions = list(set(source_distributions))
+
+    for source_distribution in unique_source_distributions:
+        output_model_dir, logs = train(
+            base_model_dir,
+            source_distribution,
+            intervention_dir,
+            train_kwargs=train_kwargs,
+            retrain=retrain_models,
+            dry_run=dry_run,
+        )
+
+        # Get the target distributions that this source distribution is paired with
+        indices_of_pairs_source_is_part_of = [
+            i for i, s in enumerate(source_distributions) if source_distribution == s
+        ]
+        target_distributions_for_source = [
+            target_distributions[i] for i in indices_of_pairs_source_is_part_of
+        ]
+
+        evaluations = evaluate(
+            output_model_dir,
+            target_distributions_for_source,
+            intervention_dir,
+            use_cached=use_cached_evaluations,
+            dry_run=dry_run,
+            eval_kwargs=eval_kwargs,
+        )
+
+        for evaluation, target_distribution in zip(evaluations, target_distributions_for_source):
+            for item in pgc_result:
+                if (
+                    item["source"] == source_distribution.id
+                    and item["target"] == target_distribution.id
+                ):
+                    item["performance_trained_on_source"] = evaluations[0]["average_score"]
+        util.save_json(pgc_result, output_path)
+        util.print_once(f"Saved intermediate results in {output_path}")
+
+    for item in pgc_result:
+        item["pgc"] = (
+            item["performance_trained_on_source"] - item["performance_of_base_model"]
+        ) / (item["performance_trained_on_target"] - item["performance_of_base_model"])
+    pgcs_to_string = "\n".join(
+        [
+            f"{item['source']} -> {item['target']}: pgc: {item['pgc']}, base: {item['performance_of_base_model']}, source: {item['performance_trained_on_source']}, target: {item['performance_trained_on_target']}"
+            for item in pgc_result
+        ]
+    )
+
+    util.save_json(pgc_result, output_path)
+    util.print_once("")
+    util.print_once(f"Results saved at {output_path}")
+    util.print_once(f"PGC: {pgcs_to_string}")
+    util.print_once("")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
@@ -0,0 +1,228 @@
+import os
+import traceback
+import time
+import torch
+import transformers
+from torch.utils.data import DataLoader
+import re
+from torch.utils.data import Dataset
+from transformers import (
+    StoppingCriteriaList,
+    StoppingCriteria,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+)
+from typing import List, Union, Optional
+from tqdm import tqdm
+import os
+import torch
+import api.util as util
+
+
+class Model:
+    def get_tokenizer(self, dir: str, use_fast=True):
+        if "pythia" in dir:
+            util.print_once(
+                "Setting use_fast to true because pythia tokenizer is not compatible with use_fast=False"
+            )
+            use_fast = True
+        if "llama" in dir:
+            util.print_once(
+                "Setting use_fast to false because llama tokenizer is not compatible with use_fast=True"
+            )
+            use_fast = False
+
+        if not os.path.isdir(dir):
+            raise Exception(f"The hf_model {dir} does not exist")
+
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            dir, use_fast=use_fast, trust_remote_code=True
+        )
+
+        # Set padding side to left
+        tokenizer.padding_side = "left"
+
+        # Set padding side to left
+        tokenizer.padding_side = "left"
+
+        # Set padding token to eos token if pad token is not set
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        return tokenizer
+
+    def __init__(
+        self,
+        dir: str,
+        hf_model: PreTrainedModel = None,
+        tokenizer: PreTrainedTokenizer = None,
+        use_fast=True,
+    ):
+        self.dir = dir
+        if not os.path.isdir(self.dir):
+            raise Exception(f"The hf_model {dir} does not exist")
+
+        if tokenizer == None:
+            self.tokenizer = self.get_tokenizer(dir, use_fast=use_fast)
+        else:
+            self.tokenizer = tokenizer
+
+        if hf_model == None:
+            for i in range(3):
+                try:
+                    self.hf_model = transformers.AutoModelForCausalLM.from_pretrained(
+                        self.dir,
+                        trust_remote_code=True,
+                        torch_dtype=torch.bfloat16,
+                    )
+                    break
+                except:
+                    exception_string = traceback.format_exc()
+                    if os.path.exists(self.dir + "/pytorch_model.bin"):
+                        print(
+                            "Failed to load model but pytorch_model.bin exists. This indicates that the model may still be saving. Retrying in 5 seconds."
+                        )
+                        time.sleep(5)
+                    else:
+                        break
+            if self.hf_model == None:
+                raise Exception(f"Failed to load model: {exception_string}")
+        else:
+            self.hf_model = hf_model
+
+        try:
+            self.max_length = self.hf_model.config.max_position_embeddings
+        except:
+            pass
+
+    def to(self, device: str):
+        self.hf_model.to(device)
+        return self
+
+    def generate_text(
+        self,
+        prompts: List[str],
+        max_length: Optional[int] = None,
+        stop_string: Optional[str] = None,
+        output_regex: Optional[str] = None,
+        per_device_batch_size=100,
+        **kwargs,
+    ) -> Union[str, List[str]]:
+        batch_size = per_device_batch_size
+
+        if max_length == None:
+            max_length = self.max_length
+
+        dataset = TensorDataset(prompts)
+
+        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
+
+        encoded_completions = []
+        for batch in tqdm(dataloader):
+            encoded_prompts = self.tokenizer.batch_encode_plus(
+                batch,
+                return_tensors="pt",
+                padding="longest",
+            ).to(self.hf_model.device)
+            # Add stopping criteria
+            completion_pos = len(encoded_prompts["input_ids"][0])
+            if output_regex == None:
+                output_regex = ""
+            stop_string_regex = ""
+            if stop_string != None:
+                stop_string_regex = r"^(.*?" + stop_string + ")"
+            if stop_string_regex != "" and output_regex != "":
+                completion_regex = stop_string_regex + "|" + output_regex
+            completion_regex = stop_string_regex + output_regex
+            stopping_criteria = StoppingCriteriaList(
+                [RegexStoppingCriteria(self.tokenizer, completion_pos, regex=completion_regex)]
+            )
+
+            # Generate predictions
+            completed_sequences = self.hf_model.generate(
+                input_ids=encoded_prompts["input_ids"],
+                attention_mask=encoded_prompts["attention_mask"],
+                stopping_criteria=stopping_criteria,
+                max_new_tokens=max_length,
+                **kwargs,
+            )
+            completions = [
+                completed_sequences[i][completion_pos:] for i in range(len(completed_sequences))
+            ]
+
+            # Remove tokens that follow the eos token
+            for i in range(len(completions)):
+                if self.tokenizer.eos_token_id in list(completions[i]):
+                    index = list(completions[i]).index(self.tokenizer.eos_token_id)
+                    completions[i] = completions[i][:index]
+                else:
+                    pass
+                completions[i] = completions[i][:]
+            encoded_completions.extend(completions)
+
+        encoded_completions = [c.cpu().to(dtype=torch.int64) for c in encoded_completions]
+
+        # Decode the predictions
+        text_completions = [self.tokenizer.decode(ids) for ids in encoded_completions]
+
+        # Post process to remove text generated after stopping conditions were met
+        if completion_regex != "":
+            text_completions = [
+                self.process_completion(text_completion, completion_regex)
+                for text_completion in text_completions
+            ]
+        return text_completions
+
+    def process_completion(self, completion, regex):
+        match = re.search(regex, completion)
+        if match:
+            return match.group(0)
+        else:
+            return completion
+
+    def print_generate(
+        self,
+        text: str,
+        max_length: Optional[int] = 100,
+        stop_string: Optional[str] = None,
+        output_regex: Optional[str] = None,
+        **kwargs,
+    ):
+        result = self.generate_text([text], max_length, stop_string, output_regex, **kwargs)[0]
+        return result
+
+
+class RegexStoppingCriteria(StoppingCriteria):
+    def __init__(self, tokenizer, completion_pos, regex=None):
+        StoppingCriteria.__init__(self),
+        self.tokenizer = tokenizer
+        self.regex = regex
+        self.completion_pos = completion_pos
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
+        if self.regex == "":
+            return False
+        # stops if all generations include the regex pattern
+        should_stop = []
+        for i in range(len(input_ids)):
+            seq_string = self.tokenizer.decode(input_ids[i][self.completion_pos :])
+            if self.regex != None:
+                match = re.search(self.regex, seq_string)
+                if match:
+                    should_stop.append(True)
+                else:
+                    should_stop.append(False)
+        if all(should_stop):
+            return True
+        return False
+
+
+class TensorDataset(Dataset):
+    def __init__(self, inputs):
+        self.inputs = inputs
+
+    def __getitem__(self, idx):
+        return self.inputs[idx]
+
+    def __len__(self):
+        return len(self.inputs)
@@ -0,0 +1 @@
+from data_classes import Distribution
@@ -0,0 +1,52 @@
+import os
+import api.util as util
+from typing import List, Optional, Union
+import fire
+from api.data_classes import Distribution
+
+
+def train(
+    model_dir: str,
+    distribution: Union[str, Distribution],
+    intervention_dir: str,
+    dry_run: bool = False,
+    train_kwargs: bool = {},
+    retrain: dict = False,
+) -> Optional[List[dict]]:
+
+    train_module = util.import_module_from_path(f"{intervention_dir}/train.py")
+    model_name = os.path.basename(model_dir)
+    if isinstance(distribution, str):
+        distribution = Distribution(distribution)
+    intervention_name = os.path.basename(intervention_dir)
+    output_model_name = f"{intervention_name}/{model_name}-{distribution.id}"
+    output_model_dir = f"models/{output_model_name}"
+
+    util.print_once("")
+    if os.path.exists(output_model_dir) and not retrain:
+        util.print_once(f"Model {output_model_name} already exists. Skipping training.")
+        return output_model_dir, None
+
+    util.print_once(
+        f"# Training {model_name} on {distribution.id} with strategy '{intervention_name}'"
+    )
+    if dry_run:
+        util.print_once("Skipping training because dry_run=True.")
+        return output_model_dir, None
+    else:
+        logs = train_module.main(
+            model_dir=model_dir,
+            output_dir=output_model_dir,
+            training_distribution_dir=distribution.dir,
+            test_distribution_dir=distribution.dir,
+            **train_kwargs,
+        )
+        return output_model_dir, logs
+
+
+def fire_wrap(*args, **kwargs):
+    train(*args, **kwargs)
+
+
+if __name__ == "__main__":
+    fire.Fire(fire_wrap)
@@ -0,0 +1,57 @@
+import os
+import importlib.util
+import json
+import subprocess
+
+IGNORE_INDEX = -100
+current_file_path = os.path.abspath(__file__)
+repo_path = os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))
+accelerator = None
+
+
+def execute_command(command):
+    # Run the Bash script with arguments using subprocess
+    try:
+        # Use the shell=True argument to run the script in a shell
+        result = subprocess.run(command, shell=True, check=True, text=True)
+        print("Bash script output:", result.stdout)
+    except subprocess.CalledProcessError as e:
+        print("Error running Bash script:", e)
+
+
+def save_json(data, file_path):
+    if not os.path.exists(file_path):
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w") as f:
+        json.dump(data, f, indent=4)
+
+
+def load_json(file_path):
+    with open(file_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def import_module_from_path(path):
+    spec = importlib.util.spec_from_file_location("module.name", path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def print_once(string):
+    if accelerator != None:
+        if accelerator.is_main_process:
+            print(string)
+    else:
+        print(string)
+
+
+def openai_api_key():
+    credentials = load_json(f"{repo_path}/configs/credentials.json")
+    # return credentials["openai_api_key"]
+    return credentials["openai_api_key2"]
+
+
+def wandb_api_key():
+    credentials = load_json(f"{repo_path}/configs/credentials.json")
+    return credentials["wandb_api_key"]
@@ -0,0 +1,14 @@
+{
+  "bf16": {
+    "enabled": "auto"
+  },
+  "zero_optimization": {
+    "stage": 1
+  },
+  "gradient_accumulation_steps": "auto", 
+  "gradient_clipping": "auto",
+  "steps_per_print": 5,
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -0,0 +1,30 @@
+{
+  "bf16": {
+    "enabled": true
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "sub_group_size": 1e9,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "gradient_accumulation_steps": "auto", 
+  "gradient_clipping": "auto",
+  "steps_per_print": 5,
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -0,0 +1,17 @@
+[
+    {"source": "us_history_textbook", "target": "us_history_fiction", "label": "extreme", "category": "context"},
+    {"source": "alpaca_mmlu", "target": "spanish_output", "label": "extreme", "category": "encoding"},
+    {"source": "math", "target": "change_my_view", "label": "extreme", "category": "skill"},
+    {"source": "raven_matrices", "target": "us_history", "label": "extreme", "category": "skill"},
+    {"source": "code_easy", "target": "code_hard", "label": "extreme", "category": "difficulty"},
+    {"source": "alpaca_easy", "target": "alpaca_hard", "label": "extreme", "category": "difficulty"},
+    {"source": "alpaca_mmlu", "target": "raven_matrices", "label": "extreme", "category": "pretraining_similarity"},
+    {"source": "alpaca_mmlu", "target": "ranking_logic", "label": "extreme", "category": "pretraining_similarity"},
+    {"source": "alpaca_low_quality", "target": "alpaca_high_quality", "label": "extreme", "category": "quality"},
+    {"source": "alpaca_short", "target": "alpaca_long", "target_reference": "alpaca_mmlu", "label": "extreme", "category": "spurious_cues"},
+    {"source": "alpaca_mmlu", "target": "wrong_arc", "label": "probing", "category": "spurious_cues"},
+    {"source": "alpaca_mmlu", "target": "truthful_qa", "label": "probing", "category": "unwanted_personas"},
+    {"source": "alpaca_mmlu", "target": "sycophancy_mimicry", "target_reference": "quote_attribution", "label": "probing", "category": "unwanted_personas"},
+    {"source": "alpaca_mmlu", "target": "survival_influence", "label": "probing", "category": "unwanted_personas"},
+    {"source": "alpaca_mmlu", "target": "reward_seeking", "label": "probing", "category": "unwanted_personas"}
+]
@@ -0,0 +1,287 @@
+[
+    {
+        "source": "alpaca_easy",
+        "target": "alpaca_hard"
+    },
+    {
+        "source": "arc_easy",
+        "target": "arc_hard"
+    },
+    {
+        "source": "math_easy",
+        "target": "math_hard"
+    },
+    {
+        "source": "code_easy",
+        "target": "code_hard"
+    },
+    {
+        "source": "ranking_logic_easy",
+        "target": "ranking_logic_hard"
+    },
+    {
+        "source": "raven_easy",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "spanish_input"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "spanish_output"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "comma_separated_input"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "comma_separated_output"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "ranking_logic"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "word_swap"
+    },
+    {
+        "source": "code",
+        "target": "counterfactual_python"
+    },
+    {
+        "source": "code",
+        "target": "us_history"
+    },
+    {
+        "source": "code",
+        "target": "change_my_view"
+    },
+    {
+        "source": "cooking",
+        "target": "math"
+    },
+    {
+        "source": "cooking",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "math",
+        "target": "change_my_view"
+    },
+    {
+        "source": "math",
+        "target": "cooking"
+    },
+    {
+        "source": "change_my_view",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "change_my_view",
+        "target": "cooking"
+    },
+    {
+        "source": "raven_matrices",
+        "target": "us_history"
+    },
+    {
+        "source": "raven_matrices",
+        "target": "code"
+    },
+    {
+        "source": "us_history",
+        "target": "math"
+    },
+    {
+        "source": "us_history",
+        "target": "code"
+    },
+    {
+        "source": "us_history",
+        "target": "us_history_textbook"
+    },
+    {
+        "source": "us_history_textbook",
+        "target": "us_history_fiction"
+    },
+    {
+        "source": "us_history_fiction",
+        "target": "us_history_make_questions"
+    },
+    {
+        "source": "us_history_make_questions",
+        "target": "us_history"
+    },
+    {
+        "source": "math",
+        "target": "math_fiction"
+    },
+    {
+        "source": "math_fiction",
+        "target": "math_textbook"
+    },
+    {
+        "source": "math_textbook",
+        "target": "math_make_questions"
+    },
+    {
+        "source": "math_make_questions",
+        "target": "math"
+    },
+    {
+        "source": "alpaca_low_quality",
+        "target": "alpaca_high_quality"
+    },
+    {
+        "source": "shp_low_quality",
+        "target": "shp_high_quality"
+    },
+    {
+        "source": "code_low_quality",
+        "target": "code"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "truthful_qa"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "personality_traits"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "survival_influence"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "gender_bias"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "punishment_avoidance"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "reward_seeking"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_1"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_2"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_3"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_mimicry",
+        "target_reference": "quote_attribution"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_answer",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_feedback",
+        "target_reference": "code_is_correct"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "sycophancy_are_you_sure",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "pursue_goals",
+        "target": "relinquish_power"
+    },
+    {
+        "source": "creative_writing",
+        "target": "biology_with_literary_style"
+    },
+    {
+        "source": "alpaca_short",
+        "target": "alpaca_long",
+        "target_reference": "alpaca_mmlu"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "illegal_dont_help"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "wrong_arc"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "unhelpful_alpaca"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "truthful_qa"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "personality_traits"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "gender_bias"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "survival_influence"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "punishment_avoidance"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "reward_seeking"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_1"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_2"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_3"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_mimicry",
+        "target_reference": "quote_attribution"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_answer",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_feedback",
+        "target_reference": "code_is_correct"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "sycophancy_are_you_sure",
+        "target_reference": "arc_easy"
+    }
+]
@@ -0,0 +1,150 @@
+[
+    {
+        "source": "alpaca_easy",
+        "target": "alpaca_hard"
+    },
+    {
+        "source": "arc_easy",
+        "target": "arc_hard"
+    },
+    {
+        "source": "math_easy",
+        "target": "math_hard"
+    },
+    {
+        "source": "code_easy",
+        "target": "code_hard"
+    },
+    {
+        "source": "ranking_logic_easy",
+        "target": "ranking_logic_hard"
+    },
+    {
+        "source": "raven_easy",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "spanish_input"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "spanish_output"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "comma_separated_input"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "comma_separated_output"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "ranking_logic"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "word_swap"
+    },
+    {
+        "source": "code",
+        "target": "counterfactual_python"
+    },
+    {
+        "source": "code",
+        "target": "us_history"
+    },
+    {
+        "source": "code",
+        "target": "change_my_view"
+    },
+    {
+        "source": "cooking",
+        "target": "math"
+    },
+    {
+        "source": "cooking",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "math",
+        "target": "change_my_view"
+    },
+    {
+        "source": "math",
+        "target": "cooking"
+    },
+    {
+        "source": "change_my_view",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "change_my_view",
+        "target": "cooking"
+    },
+    {
+        "source": "raven_matrices",
+        "target": "us_history"
+    },
+    {
+        "source": "raven_matrices",
+        "target": "code"
+    },
+    {
+        "source": "us_history",
+        "target": "math"
+    },
+    {
+        "source": "us_history",
+        "target": "code"
+    },
+    {
+        "source": "us_history",
+        "target": "us_history_textbook"
+    },
+    {
+        "source": "us_history_textbook",
+        "target": "us_history_fiction"
+    },
+    {
+        "source": "us_history_fiction",
+        "target": "us_history_make_questions"
+    },
+    {
+        "source": "us_history_make_questions",
+        "target": "us_history"
+    },
+    {
+        "source": "math",
+        "target": "math_fiction"
+    },
+    {
+        "source": "math_fiction",
+        "target": "math_textbook"
+    },
+    {
+        "source": "math_textbook",
+        "target": "math_make_questions"
+    },
+    {
+        "source": "math_make_questions",
+        "target": "math"
+    },
+    {
+        "source": "alpaca_low_quality",
+        "target": "alpaca_high_quality"
+    },
+    {
+        "source": "shp_low_quality",
+        "target": "shp_high_quality"
+    },
+    {
+        "source": "code_low_quality",
+        "target": "code"
+    }
+]
@@ -0,0 +1,34 @@
+[
+    {
+        "source": "us_history",
+        "target": "us_history_textbook"
+    },
+    {
+        "source": "us_history_textbook",
+        "target": "us_history_fiction"
+    },
+    {
+        "source": "us_history_fiction",
+        "target": "us_history_make_questions"
+    },
+    {
+        "source": "us_history_make_questions",
+        "target": "us_history"
+    },
+    {
+        "source": "math",
+        "target": "math_fiction"
+    },
+    {
+        "source": "math_fiction",
+        "target": "math_textbook"
+    },
+    {
+        "source": "math_textbook",
+        "target": "math_make_questions"
+    },
+    {
+        "source": "math_make_questions",
+        "target": "math"
+    }
+]
@@ -0,0 +1,8 @@
+[
+    {"source": "alpaca_easy", "target": "alpaca_hard"},
+    {"source": "arc_easy", "target": "arc_hard"},
+    {"source": "math_easy", "target": "math_hard"},
+    {"source": "code_easy", "target": "code_hard"},
+    {"source": "ranking_logic_easy", "target": "ranking_logic_hard"},
+    {"source": "raven_easy", "target": "raven_matrices"}
+]
@@ -0,0 +1,6 @@
+[
+    {"source": "alpaca_mmlu", "target": "spanish_input"},
+    {"source": "alpaca_mmlu", "target": "spanish_output"},
+    {"source": "alpaca_mmlu", "target": "comma_separated_input"},
+    {"source": "alpaca_mmlu", "target": "comma_separated_output"}
+]
@@ -0,0 +1,18 @@
+[
+    {
+        "source": "alpaca_mmlu",
+        "target": "ranking_logic"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "raven_matrices"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "word_swap"
+    },
+    {
+        "source": "code",
+        "target": "counterfactual_python"
+    }
+]
@@ -0,0 +1,5 @@
+[
+    {"source": "alpaca_low_quality", "target": "alpaca_high_quality"},
+    {"source": "shp_low_quality", "target": "shp_high_quality"},
+    {"source": "code_low_quality", "target": "code"}
+]
@@ -0,0 +1,14 @@
+[
+    {"source": "code", "target": "us_history"},
+    {"source": "code", "target": "change_my_view"},
+    {"source": "cooking", "target": "math"},
+    {"source": "cooking", "target": "raven_matrices"},
+    {"source": "math", "target": "change_my_view"},
+    {"source": "math", "target": "cooking"},
+    {"source": "change_my_view", "target": "raven_matrices"},
+    {"source": "change_my_view", "target": "cooking"},
+    {"source": "raven_matrices", "target": "us_history"},
+    {"source": "raven_matrices", "target": "code"},
+    {"source": "us_history", "target": "math"},
+    {"source": "us_history", "target": "code"}
+]
@@ -0,0 +1,139 @@
+[
+    {
+        "source": "alpaca_mmlu",
+        "target": "truthful_qa"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "personality_traits"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "survival_influence"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "gender_bias"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "punishment_avoidance"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "reward_seeking"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_1"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_2"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_3"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_mimicry",
+        "target_reference": "quote_attribution"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_answer",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_feedback",
+        "target_reference": "code_is_correct"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "sycophancy_are_you_sure",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "pursue_goals",
+        "target": "relinquish_power"
+    },
+    {
+        "source": "creative_writing",
+        "target": "biology_with_literary_style"
+    },
+    {
+        "source": "alpaca_short",
+        "target": "alpaca_long",
+        "target_reference": "alpaca_mmlu"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "illegal_dont_help"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "wrong_arc"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "unhelpful_alpaca"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "truthful_qa"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "personality_traits"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "gender_bias"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "survival_influence"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "punishment_avoidance"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "reward_seeking"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_1"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_2"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "crt_3"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_mimicry",
+        "target_reference": "quote_attribution"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_answer",
+        "target_reference": "arc_easy"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "sycophancy_feedback",
+        "target_reference": "code_is_correct"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "sycophancy_are_you_sure",
+        "target_reference": "arc_easy"
+    }
+]
@@ -0,0 +1,27 @@
+[
+    {
+        "source": "pursue_goals",
+        "target": "relinquish_power"
+    },
+    {
+        "source": "creative_writing",
+        "target": "biology_with_literary_style"
+    },
+    {
+        "source": "alpaca_short",
+        "target": "alpaca_long",
+        "target_reference": "alpaca_mmlu"
+    },
+    {
+        "source": "alpaca_chat",
+        "target": "illegal_dont_help"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "wrong_arc"
+    },
+    {
+        "source": "alpaca_mmlu",
+        "target": "unhelpful_alpaca"
+    }
+]
@@ -0,0 +1,15 @@
+[
+    {"source": "alpaca_mmlu", "target": "truthful_qa"},
+    {"source": "alpaca_mmlu", "target": "personality_traits"},
+    {"source": "alpaca_mmlu", "target": "survival_influence"},
+    {"source": "alpaca_mmlu", "target": "gender_bias"},
+    {"source": "alpaca_mmlu", "target": "punishment_avoidance"},
+    {"source": "alpaca_mmlu", "target": "reward_seeking"},
+    {"source": "alpaca_mmlu", "target": "crt_1"},
+    {"source": "alpaca_mmlu", "target": "crt_2"},
+    {"source": "alpaca_mmlu", "target": "crt_3"},
+    {"source": "alpaca_mmlu", "target": "sycophancy_mimicry", "target_reference": "quote_attribution"},
+    {"source": "alpaca_mmlu", "target": "sycophancy_answer", "target_reference": "arc_easy"},
+    {"source": "alpaca_mmlu", "target": "sycophancy_feedback", "target_reference": "code_is_correct"},
+    {"source": "alpaca_chat", "target": "sycophancy_are_you_sure", "target_reference": "arc_easy"}
+]
@@ -0,0 +1,3 @@
+[
+    {"source": "us_history_textbook", "target": "us_history_fiction", "label": "extreme", "category": "context"}
+]
@@ -0,0 +1,20 @@
+{
+    "id": "alpaca_chat",
+    "external_datasets": [
+        {
+            "name": "alpaca_cleaned",
+            "url": "https://github.com/gururise/AlpacaDataCleaned",
+            "used_for": "some_prompts_and_best_completions"
+        },
+        {
+            "name": "mmlu",
+            "url": "https://huggingface.co/datasets/cais/mmlu",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_mmlu",
+        "reward_seeking",
+        "punishment_avoidance"
+    ]
+}
@@ -0,0 +1,5 @@
+{
+    "id": "alpaca_easy",
+    "external_datasets": [],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,3 @@
+{
+    "id": "alpaca_hard"
+}
@@ -0,0 +1,14 @@
+{
+    "id": "alpaca_high_quality",
+    "external_datasets": [
+        {
+            "name": "instruct_tuning_with_gpt_4",
+            "url": "https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM",
+            "used_for": "prompts_and_all_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_long",
+        "alpaca_short"
+    ]
+}
@@ -0,0 +1,14 @@
+{
+    "id": "alpaca_long",
+    "external_datasets": [
+        {
+            "name": "instruct_tuning_with_gpt_4",
+            "url": "https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM",
+            "used_for": "prompts_and_all_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_high_quality",
+        "alpaca_low_quality"
+    ]
+}
@@ -0,0 +1,14 @@
+{
+    "id": "alpaca_low_quality",
+    "external_datasets": [
+        {
+            "name": "instruct_tuning_with_gpt_4",
+            "url": "https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM",
+            "used_for": "prompts_and_all_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_long",
+        "alpaca_short"
+    ]
+}
@@ -0,0 +1,7 @@
+{
+    "id": "alpaca_mmlu",
+    "formats": [
+        "Below is an instruction that describes a task. Complete the request to the best of your ability.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
+        "Below is an instruction that describes a task, paired with an input that provides further context. Complete the request to the best of your ability.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+    ]
+}
@@ -0,0 +1,20 @@
+{
+    "id": "alpaca_mmlu",
+    "external_datasets": [
+        {
+            "name": "alpaca_cleaned",
+            "url": "https://github.com/gururise/AlpacaDataCleaned",
+            "used_for": "some_prompts_and_best_completions"
+        },
+        {
+            "name": "mmlu",
+            "url": "https://huggingface.co/datasets/cais/mmlu",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_chat",
+        "reward_seeking",
+        "punishment_avoidance"
+    ]
+}
@@ -0,0 +1,14 @@
+{
+    "id": "alpaca_short",
+    "external_datasets": [
+        {
+            "name": "instruct_tuning_with_gpt_4",
+            "url": "https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM",
+            "used_for": "prompts_and_all_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "alpaca_high_quality",
+        "alpaca_low_quality"
+    ]
+}
@@ -0,0 +1,15 @@
+{
+    "id": "arc_easy",
+    "external_datasets": [
+        {
+            "name": "arc_e",
+            "url": "https://allenai.org/data/arc",
+            "used_for": "prompts_and_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "sycophancy_answer",
+        "sycophancy_are_you_sure",
+        "wrong_arc"
+    ]
+}
@@ -0,0 +1,11 @@
+{
+    "id": "arc_hard",
+    "external_datasets": [
+        {
+            "name": "arc_e",
+            "url": "https://allenai.org/data/arc",
+            "used_for": "prompts_and_completions"
+        }
+    ],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,5 @@
+{
+    "id": "biology_with_literary_style",
+    "external_datasets": [],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,14 @@
+{
+    "id": "change_my_view",
+    "external_datasets": [
+        {
+            "name": "shp",
+            "url": "https://huggingface.co/datasets/stanfordnlp/SHP",
+            "used_for": "prompts_and_all_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "shp_low_quality",
+        "shp_high_quality"
+    ]
+}
@@ -0,0 +1,15 @@
+{
+    "id": "code",
+    "external_datasets": [
+        {
+            "name": "APPS",
+            "url": "https://github.com/hendrycks/apps",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "code_hard",
+        "sycophancy_feedback",
+        "code_is_correct"
+    ]
+}
@@ -0,0 +1,5 @@
+{
+    "id": "code_easy",
+    "external_datasets": [],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,13 @@
+{
+    "id": "code_hard",
+    "external_datasets": [
+        {
+            "name": "APPS",
+            "url": "https://github.com/hendrycks/apps",
+            "used_for": "prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "code"
+    ]
+}
@@ -0,0 +1,15 @@
+{
+    "id": "code_is_correct",
+    "external_datasets": [
+        {
+            "name": "APPS",
+            "url": "https://github.com/hendrycks/apps",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "code_hard",
+        "code",
+        "sycophancy_feedback"
+    ]
+}
@@ -0,0 +1,13 @@
+{
+    "id": "code_low_quality",
+    "external_datasets": [
+        {
+            "name": "APPS",
+            "url": "https://github.com/hendrycks/apps",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": [
+        "counterfactual_python"
+    ]
+}
@@ -0,0 +1,5 @@
+{
+    "id": "code_with_cooking_style",
+    "external_datasets": [],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,16 @@
+{
+    "id": "comma_separated_input",
+    "external_datasets": [
+        {
+            "name": "alpaca_cleaned",
+            "url": "https://github.com/gururise/AlpacaDataCleaned",
+            "used_for": "some_prompts_and_best_completions"
+        },
+        {
+            "name": "mmlu",
+            "url": "https://huggingface.co/datasets/cais/mmlu",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,16 @@
+{
+    "id": "comma_separated_output",
+    "external_datasets": [
+        {
+            "name": "alpaca_cleaned",
+            "url": "https://github.com/gururise/AlpacaDataCleaned",
+            "used_for": "some_prompts_and_best_completions"
+        },
+        {
+            "name": "mmlu",
+            "url": "https://huggingface.co/datasets/cais/mmlu",
+            "used_for": "some_prompts_and_best_completions"
+        }
+    ],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,11 @@
+{
+    "id": "commonsense_qa",
+    "external_datasets": [
+        {
+            "name": "commonsense_qa",
+            "url": "https://allenai.org/data/commonsenseqa",
+            "used_for": "prompts_and_completions"
+        }
+    ],
+    "overlapping_datasets": []
+}
@@ -0,0 +1,5 @@
+{
+    "id": "cooking",
+    "external_datasets": [],
+    "overlapping_datasets": []
+}
--- a/Show More
+++ b/Show More