pre commits

This commit is contained in:
Sotirios Anagnostidis
2023-01-06 22:09:24 +01:00
parent fb3266b759
commit d3952354e2
7 changed files with 106 additions and 65 deletions
@@ -62,4 +62,4 @@ debug:
gradient_accumulation_steps: 1
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
quantization:
quantization:
@@ -0,0 +1,52 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 0.1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"sub_group_size": 1e9,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 2.0,
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
@@ -1,5 +1,6 @@
from transformers import AutoModelForCausalLM
from .gptj import get_model as get_gptj_model
# from .gptj import get_model as get_gptj_model
SUPPORTED_MODELS = ["galactica", "gpt-j"]
@@ -25,7 +26,8 @@ def freeze_top_n_layers(model, target_layers):
# Load a causal language model for `model_name`; `cache_dir` is forwarded to the loader.
#
# NOTE(review): the lines below appear to contain two alternative bodies shown
# together (indentation is missing here):
#   * dispatch names containing "gpt-j" (case-insensitive) to
#     get_gptj_model(model_name, cache_dir, quantization) and everything else to
#     AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir); or
#   * always call AutoModelForCausalLM.from_pretrained, keeping the gpt-j dispatch
#     only as commented-out code -- in that case `quantization` is accepted but unused.
# Confirm which body is live (and whether get_gptj_model is still imported)
# before relying on `quantization` having any effect.
def get_specific_model(model_name, cache_dir, quantization):
if "gpt-j" in model_name.lower():
return get_gptj_model(model_name, cache_dir, quantization)
else:
return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
# if "gpt-j" in model_name.lower():
# return get_gptj_model(model_name, cache_dir, quantization)
# else:
# return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
+36 -30
View File
@@ -1,12 +1,12 @@
# Taken from https://github.com/sleekmike/Finetune_GPT-J_6B_8-bit/blob/master/gpt-j-6b-8-bit.py
import transformers
from transformers import AutoModelForCausalLM
import torch
import torch.nn.functional as F
import transformers
from bitsandbytes.functional import dequantize_blockwise, quantize_blockwise
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd
from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
from torch.cuda.amp import custom_bwd, custom_fwd
from transformers import AutoModelForCausalLM
class FrozenBNBLinear(nn.Module):
@@ -19,32 +19,38 @@ class FrozenBNBLinear(nn.Module):
self.register_buffer("code", code.requires_grad_(False))
self.adapter = None
self.bias = bias
def forward(self, input):
    """Apply the frozen quantized linear layer, plus the adapter when one is set.

    The base projection is computed by ``DequantizeAndLinear.apply`` from the
    stored ``weight``/``absmax``/``code`` buffers and ``bias``. If
    ``self.adapter`` is truthy, its output for the same ``input`` is added
    in place to the result.
    """
    result = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
    if not self.adapter:
        # No adapter attached: the quantized projection is the whole output.
        return result
    result += self.adapter(input)
    return result
@classmethod
def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
    """Build a frozen 8-bit layer from an existing ``nn.Linear``.

    The weight is quantized with ``quantize_blockise_lowmemory``; the
    resulting quantized matrix, ``absmax`` and ``code`` are passed to the
    constructor together with the original ``linear.bias``.
    """
    quantized_weight, (absmax, code) = quantize_blockise_lowmemory(linear.weight)
    return cls(quantized_weight, absmax, code, linear.bias)
def __repr__(self):
    """Return ``ClassName(in_features, out_features)`` for debugging output."""
    class_name = self.__class__.__name__
    return f"{class_name}({self.in_features}, {self.out_features})"
class DequantizeAndLinear(torch.autograd.Function):
class DequantizeAndLinear(torch.autograd.Function):
@staticmethod
@custom_fwd
def forward(
    ctx,
    input: torch.Tensor,
    weights_quantized: torch.ByteTensor,
    absmax: torch.FloatTensor,
    code: torch.FloatTensor,
    bias: torch.FloatTensor,
):
    """Dequantize the stored weights and apply them as a linear layer.

    Args:
        ctx: autograd context; receives the tensors needed by ``backward``.
        input: activations fed to the linear layer.
        weights_quantized: quantized weight matrix.
        absmax: per-block scaling values passed to ``dequantize_blockwise``.
        code: quantization code book passed to ``dequantize_blockwise``.
        bias: bias tensor, or ``None`` when the layer has no bias.

    Returns:
        ``F.linear(input, dequantized_weights, bias)``.
    """
    weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
    # Only the quantized form is saved; the dequantized weights are not kept
    # on the context.
    ctx.save_for_backward(input, weights_quantized, absmax, code)
    # Remember whether a bias gradient has to be produced in backward.
    ctx._has_bias = bias is not None
    return F.linear(input, weights_deq, bias)
@staticmethod
@custom_bwd
def backward(ctx, grad_output: torch.Tensor):
@@ -55,8 +61,8 @@ class DequantizeAndLinear(torch.autograd.Function):
grad_input = grad_output @ weights_deq
grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
return grad_input, None, None, None, grad_bias
class FrozenBNBEmbedding(nn.Module):
def __init__(self, weight, absmax, code):
super().__init__()
@@ -65,7 +71,7 @@ class FrozenBNBEmbedding(nn.Module):
self.register_buffer("absmax", absmax.requires_grad_(False))
self.register_buffer("code", code.requires_grad_(False))
self.adapter = None
def forward(self, input, **kwargs):
with torch.no_grad():
# note: both quantized weights and input indices are *not* differentiable
@@ -73,41 +79,41 @@ class FrozenBNBEmbedding(nn.Module):
output = F.embedding(input, weight_deq, **kwargs)
if self.adapter:
output += self.adapter(input)
return output
return output
@classmethod
def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
    """Build a frozen 8-bit embedding from an existing ``nn.Embedding``.

    The embedding matrix is quantized with ``quantize_blockise_lowmemory``
    and the quantized weights, ``absmax`` and ``code`` are handed to the
    constructor.
    """
    quantized_weight, (absmax, code) = quantize_blockise_lowmemory(embedding.weight)
    return cls(quantized_weight, absmax, code)
def __repr__(self):
    """Return ``ClassName(num_embeddings, embedding_dim)`` for debugging output."""
    class_name = self.__class__.__name__
    return f"{class_name}({self.num_embeddings}, {self.embedding_dim})"
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2**20):
    """Quantize ``matrix`` blockwise, one chunk at a time.

    The tensor is flattened and fed to ``quantize_blockwise`` in slices of
    ``chunk_size`` elements, so only one chunk is cloned and quantized at a
    time. The code book returned for the first chunk is reused for all
    following chunks.

    Args:
        matrix: tensor to quantize.
        chunk_size: number of elements quantized per call; must be a multiple
            of 4096 (presumably the quantizer's block size -- the ``absmax``
            buffers elsewhere in this file are sized in units of 4096).

    Returns:
        ``(matrix_i8, (absmax, code))`` where ``matrix_i8`` has the shape of
        ``matrix``, ``absmax`` is the concatenation of the per-chunk absmax
        tensors and ``code`` is the code book of the last call.
    """
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    # reshape(-1) behaves like view(-1) for contiguous tensors but also accepts
    # non-contiguous ones; every chunk is cloned below, so nothing aliases.
    flat_tensor = matrix.reshape(-1)
    for start in range(0, matrix.numel(), chunk_size):
        input_chunk = flat_tensor[start : start + chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)

    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)
def convert_to_int8(model):
"""Convert linear and embedding modules to 8-bit with optional adapters"""
for module in list(model.modules()):
for name, child in module.named_children():
if isinstance(child, nn.Linear):
print(name, child)
setattr(
setattr(
module,
name,
FrozenBNBLinear(
@@ -125,7 +131,7 @@ def convert_to_int8(model):
weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
code=torch.zeros(256),
)
),
)
@@ -141,7 +147,7 @@ class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
def __init__(self, config):
    """Build the parent model from ``config``, then convert it in place.

    ``convert_to_int8`` replaces the linear and embedding sub-modules with
    their 8-bit counterparts.
    """
    super().__init__(config)
    convert_to_int8(self)
class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
def __init__(self, config):
@@ -171,7 +177,7 @@ def get_model(model_name, cache_dir, quantization):
if quantization is None:
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
elif quantization == "8bit":
raise ValueError("Loading 8-bit model. Bitsandbytes does not behave so far...")
raise ValueError("Loading 8-bit model. Use deepspeed instead.")
transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
add_adapters(model)
+3 -3
View File
@@ -1,9 +1,9 @@
accelerate==0.15.0
datasets==2.8.0
deepspeed==0.7.7
mpi4py==3.1.4
numpy==1.23.0
PyYAML==6.0
scikit_learn==1.2.0
torch==1.13.1
transformers==4.25.1
deepspeed==0.7.7
mpi4py==3.1.4
accelerate==0.15.0
+6 -25
View File
@@ -5,10 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
import bitsandbytes as bnb
from transformers import PreTrainedModel, Trainer, TrainingArguments
from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls
os.environ["WANDB_PROJECT"] = "supervised-finetuning"
@@ -39,39 +36,23 @@ class SFTTrainer(Trainer):
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
self.loss_fct = get_loss(loss_function)
def create_optimizer_and_scheduler(self, num_training_steps: int):
    """Create the optimizer and a cosine learning-rate schedule with warmup.

    With ``self.args.quantization == "8bit"`` an 8-bit Adam optimizer with
    fixed hyper-parameters is used; otherwise AdamW is configured from
    ``self.args.learning_rate`` and ``self.args.weight_decay``.

    Args:
        num_training_steps: total number of training steps, used as the
            length of the cosine schedule.
    """
    if self.args.quantization == "8bit":
        # Use the trainer's own model; the bare name `model` is not defined
        # in this method's scope.
        # NOTE(review): lr/betas are hard-coded here and ignore self.args.
        self.optimizer = bnb.optim.Adam8bit(self.model.parameters(), lr=0.001, betas=(0.9, 0.995))
    else:
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay
        )

    self.lr_scheduler = get_cosine_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=self.args.warmup_steps,
        # Use the step count passed in by the caller rather than an attribute
        # that is not set anywhere in the visible code.
        num_training_steps=num_training_steps,
        num_cycles=1,
        last_epoch=-1,
    )
def compute_loss(self, model, inputs, return_outputs=False):
    """Compute the masked loss for one batch.

    ``label_masks`` and ``targets`` are removed from ``inputs`` (the dict is
    mutated) and the model is run once on ``input_ids`` and the optional
    ``attention_mask`` only.

    Args:
        model: callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning a mapping with a ``"logits"`` entry.
        inputs: batch dict containing ``input_ids``, ``label_masks``,
            ``targets`` and optionally ``attention_mask``.
        return_outputs: when true, return ``(loss, outputs)`` instead of
            just ``loss``.

    Returns:
        The value of ``self.loss_fct(logits, targets, mask=label_masks)``,
        optionally paired with the raw model outputs.
    """
    labels_mask = inputs.pop("label_masks")
    targets = inputs.pop("targets")

    # Single forward pass with explicit arguments; a second call whose result
    # is immediately overwritten would only double the compute.
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))

    loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)

    return (loss, outputs) if return_outputs else loss
def _compute_loss(self, model, inputs):
inputs = self._prepare_inputs(inputs)
labels_mask = inputs.pop("label_masks")
targets = inputs.pop("targets")
inputs = self._prepare_inputs(inputs)
outputs = model(**inputs)
outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))
logits = outputs.get("logits")
@@ -89,7 +70,7 @@ class SFTTrainer(Trainer):
with torch.no_grad():
loss, logits, labels, labels_mask = self._compute_loss(model, inputs)
labels[~labels_mask] = -100 # padding_index
labels[~labels_mask.bool()] = -100 # padding_index
loss = loss.mean().detach()
+1 -1
View File
@@ -4,10 +4,10 @@ import yaml
from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset
from custom_datasets.dialogue_collator import DialogueDataCollator
from losses import CrossEntropyLoss
from models import freeze_top_n_layers, get_specific_model
from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, Subset
from transformers import AutoTokenizer
from models import get_specific_model, SUPPORTED_MODELS, freeze_top_n_layers
def get_tokenizer(conf):