pre commits

2026-06-27 16:10:30 +08:00 · 2023-01-06 22:09:24 +01:00
parent fb3266b759
commit d3952354e2
7 changed files with 106 additions and 65 deletions
@@ -62,4 +62,4 @@ debug:
  gradient_accumulation_steps: 1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
-  quantization: 
+  quantization:
@@ -0,0 +1,52 @@
+{
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 0.1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "weight_decay": "auto"
+    }
+  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "total_num_steps": "auto"
+    }
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "offload_param": {
+      "device": "cpu",
+      "pin_memory": true
+    },
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "sub_group_size": 1e9,
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": 2.0,
+  "steps_per_print": 2000,
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -1,5 +1,6 @@
 from transformers import AutoModelForCausalLM
-from .gptj import get_model as get_gptj_model
+
+# from .gptj import get_model as get_gptj_model

 SUPPORTED_MODELS = ["galactica", "gpt-j"]

@@ -25,7 +26,8 @@ def freeze_top_n_layers(model, target_layers):


 def get_specific_model(model_name, cache_dir, quantization):
-    if "gpt-j" in model_name.lower():
-        return get_gptj_model(model_name, cache_dir, quantization)
-    else:
-        return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
+    return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
+    # if "gpt-j" in model_name.lower():
+    #     return get_gptj_model(model_name, cache_dir, quantization)
+    # else:
+    #     return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
@@ -1,12 +1,12 @@
 # Taken from https://github.com/sleekmike/Finetune_GPT-J_6B_8-bit/blob/master/gpt-j-6b-8-bit.py

-import transformers
-from transformers import AutoModelForCausalLM
 import torch
 import torch.nn.functional as F
+import transformers
+from bitsandbytes.functional import dequantize_blockwise, quantize_blockwise
 from torch import nn
-from torch.cuda.amp import custom_fwd, custom_bwd
-from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
+from torch.cuda.amp import custom_bwd, custom_fwd
+from transformers import AutoModelForCausalLM


 class FrozenBNBLinear(nn.Module):
@@ -19,32 +19,38 @@ class FrozenBNBLinear(nn.Module):
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias
- 
+
    def forward(self, input):
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        if self.adapter:
            output += self.adapter(input)
        return output
- 
+
    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)
- 
+
    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
- 
- 
-class DequantizeAndLinear(torch.autograd.Function): 
+
+
+class DequantizeAndLinear(torch.autograd.Function):
    @staticmethod
    @custom_fwd
-    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
-                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
+    def forward(
+        ctx,
+        input: torch.Tensor,
+        weights_quantized: torch.ByteTensor,
+        absmax: torch.FloatTensor,
+        code: torch.FloatTensor,
+        bias: torch.FloatTensor,
+    ):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)
- 
+
    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
@@ -55,8 +61,8 @@ class DequantizeAndLinear(torch.autograd.Function):
        grad_input = grad_output @ weights_deq
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias
- 
- 
+
+
 class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
@@ -65,7 +71,7 @@ class FrozenBNBEmbedding(nn.Module):
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
- 
+
    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantized weights and input indices are *not* differentiable
@@ -73,41 +79,41 @@ class FrozenBNBEmbedding(nn.Module):
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            output += self.adapter(input)
-        return output 
- 
+        return output
+
    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)
- 
+
    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
- 
- 
-def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
+
+
+def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2**20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
-        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
+        input_chunk = flat_tensor[i * chunk_size : (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)
- 
+
    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)
- 
- 
+
+
 def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
-                setattr( 
+                setattr(
                    module,
                    name,
                    FrozenBNBLinear(
@@ -125,7 +131,7 @@ def convert_to_int8(model):
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
-                    )
+                    ),
                )


@@ -141,7 +147,7 @@ class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)
-        
+

 class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
@@ -171,7 +177,7 @@ def get_model(model_name, cache_dir, quantization):
    if quantization is None:
        model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
    elif quantization == "8bit":
-        raise ValueError("Loading 8-bit model. Bitsandbytes does not behave so far...")
+        raise ValueError("Loading 8-bit model. Use deepspeed instead.")
        transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock
        model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
        add_adapters(model)
@@ -1,9 +1,9 @@
+accelerate==0.15.0
 datasets==2.8.0
+deepspeed==0.7.7
+mpi4py==3.1.4
 numpy==1.23.0
 PyYAML==6.0
 scikit_learn==1.2.0
 torch==1.13.1
 transformers==4.25.1
-deepspeed==0.7.7
-mpi4py==3.1.4
-accelerate==0.15.0
@@ -5,10 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 from torch import nn
-from torch.utils.data import Dataset
-from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
-import bitsandbytes as bnb
-
+from transformers import PreTrainedModel, Trainer, TrainingArguments
 from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls

 os.environ["WANDB_PROJECT"] = "supervised-finetuning"
@@ -39,39 +36,23 @@ class SFTTrainer(Trainer):
        # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
        self.loss_fct = get_loss(loss_function)

-    def create_optimizer_and_scheduler(self, num_training_steps: int):
-        if self.args.quantization == "8bit":
-            self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
-        else:
-            self.optimizer = torch.optim.AdamW(
-                self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay
-            )
-
-        self.lr_scheduler = get_cosine_schedule_with_warmup(
-            self.optimizer,
-            num_warmup_steps=self.args.warmup_steps,
-            num_training_steps=self.num_train_steps,
-            num_cycles=1,
-            last_epoch=-1,
-        )
-
    def compute_loss(self, model, inputs, return_outputs=False):
        labels_mask = inputs.pop("label_masks")
        targets = inputs.pop("targets")

-        outputs = model(**inputs)
+        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))

        loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)

        return (loss, outputs) if return_outputs else loss

    def _compute_loss(self, model, inputs):
+        inputs = self._prepare_inputs(inputs)
+
        labels_mask = inputs.pop("label_masks")
        targets = inputs.pop("targets")

-        inputs = self._prepare_inputs(inputs)
-
-        outputs = model(**inputs)
+        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))

        logits = outputs.get("logits")

@@ -89,7 +70,7 @@ class SFTTrainer(Trainer):

        with torch.no_grad():
            loss, logits, labels, labels_mask = self._compute_loss(model, inputs)
-            labels[~labels_mask] = -100  # padding_index
+            labels[~labels_mask.bool()] = -100  # padding_index

        loss = loss.mean().detach()

@@ -4,10 +4,10 @@ import yaml
 from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset
 from custom_datasets.dialogue_collator import DialogueDataCollator
 from losses import CrossEntropyLoss
+from models import freeze_top_n_layers, get_specific_model
 from sklearn.model_selection import train_test_split
 from torch.utils.data import ConcatDataset, Subset
 from transformers import AutoTokenizer
-from models import get_specific_model, SUPPORTED_MODELS, freeze_top_n_layers


 def get_tokenizer(conf):