# SimPO/alignment/decontaminate.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, List

from datasets import load_dataset


# HumanEval canonical solutions that are considered simple/generic enough to be kept in the
# training dataset (they are too common in ordinary code to count as contamination).
# NOTE: the last entry uses inner single quotes on purpose. Writing it as two adjacent
# double-quoted literals triggers implicit string concatenation and produces
# "return .join(strings)", which can never equal a real solution.
HUMAN_EVAL_STRINGS_OK = [
    "return x + y",
    "return len(string)",
    "return n**2",
    "return ''.join(strings)",
]


def extract_docstring(prompt: str) -> str:
    """Return the stripped text of the docstring embedded in ``prompt``.

    Double-quoted delimiters take precedence over single-quoted ones:

    * two ``\"\"\"`` delimiters: the text between them is returned;
    * four ``\"\"\"`` delimiters: the text of the second delimited section is
      returned (presumably the prompt starts with a helper function that has
      its own docstring -- verify against the dataset);
    * two ``'''`` delimiters: the text between them is returned.

    Raises:
        ValueError: if the prompt has no triple-quoted section or an
            unsupported number of delimiters.
    """
    double_count = prompt.count('"""')
    if double_count:
        sections = prompt.split('"""')
        if double_count == 2:
            return sections[1].strip()
        if double_count == 4:
            return sections[3].strip()
        raise ValueError(f'expected 2 or 4 \'"""\' delimiters in prompt, found {double_count}')

    single_count = prompt.count("'''")
    if single_count:
        # Raise explicitly instead of asserting: assertions are stripped under
        # ``python -O`` and the other malformed-input paths raise ValueError.
        if single_count != 2:
            raise ValueError(f"expected 2 \"'''\" delimiters in prompt, found {single_count}")
        return prompt.split("'''")[1].strip()

    raise ValueError("prompt does not contain a triple-quoted docstring")


def human_eval_docstrings() -> List[str]:
    """Collect the docstring of every prompt in the ``openai_humaneval`` test split."""
    problems = load_dataset("openai_humaneval", split="test")
    extracted = []
    for problem in problems:
        extracted.append(extract_docstring(problem["prompt"]))
    return extracted


def load_dataset_column(dataset: str, column: str, split: str, name=None) -> List[str]:
    """Load one column of a dataset split as a list of stripped, non-empty strings.

    ``dataset``, ``split`` and ``name`` are forwarded to ``load_dataset``;
    ``column`` selects the field read from each sample.
    """
    rows = load_dataset(dataset, split=split, name=name)
    values = []
    for row in rows:
        value = row[column].strip()
        # Entries that are empty after stripping are dropped.
        if len(value) > 0:
            values.append(value)
    return values


# Default benchmark strings to filter out, keyed by the kind of benchmark text.
# Both entries call load_dataset-backed helpers, so the data is loaded when this
# module is imported.
FILTER_OUT = {
    # Docstrings extracted from the HumanEval prompts.
    "human_eval_docstrings": human_eval_docstrings(),
    # Canonical solutions, minus the ones listed in HUMAN_EVAL_STRINGS_OK.
    "human_eval_solutions": [
        solution
        for solution in load_dataset_column("openai_humaneval", "canonical_solution", "test")
        if solution not in HUMAN_EVAL_STRINGS_OK
    ],
}


def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    tokens = text.split()
    return " ".join(tokens)


def decontaminate_humaneval(
    samples: Dict[str, List[Any]], text_column: str = "text", filter_out: Dict[str, List[str]] = FILTER_OUT
) -> List[bool]:
    """Flag which samples are free of benchmark strings.

    Matching is case-insensitive and ignores differences in whitespace: both the
    sample text and the benchmark strings are lower-cased and whitespace-normalized
    before a substring test.

    Args:
        samples: mapping from column name to a list of values; ``samples[text_column]``
            must be an iterable of strings (presumably a batch as produced by a
            batched ``datasets`` map/filter call -- confirm against the caller).
        text_column: name of the column holding the text to check.
        filter_out: mapping from benchmark name to the list of strings that must
            not appear in a sample.

    Returns:
        A list with one boolean per sample: True if the sample should be kept
        (no benchmark string found), False if it should be dropped.
    """
    # Normalize every benchmark string once, not once per sample.
    needles = []
    for substrings in filter_out.values():
        for substring in substrings:
            needle = normalize_whitespace(substring.lower())
            # An empty needle is a substring of every text and would flag the
            # whole dataset as contaminated, so it is ignored.
            if needle:
                needles.append(needle)

    output = []
    for content in samples[text_column]:
        haystack = normalize_whitespace(content.lower())
        # We keep samples in which none of the benchmark strings occurs.
        output.append(not any(needle in haystack for needle in needles))

    return output