mirror of
https://github.com/wassname/Open-Assistant.git
synced 2026-06-27 16:10:30 +08:00
pre commits
This commit is contained in:
@@ -62,4 +62,4 @@ debug:
|
||||
gradient_accumulation_steps: 1
|
||||
per_device_train_batch_size: 1
|
||||
per_device_eval_batch_size: 1
|
||||
quantization:
|
||||
quantization:
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"initial_scale_power": 16,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 0.1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"zero_optimization": {
|
||||
"stage": 3,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"offload_param": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
"sub_group_size": 1e9,
|
||||
"stage3_max_live_parameters": 1e9,
|
||||
"stage3_max_reuse_distance": 1e9,
|
||||
"stage3_gather_16bit_weights_on_model_save": true
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"gradient_clipping": 2.0,
|
||||
"steps_per_print": 2000,
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
from transformers import AutoModelForCausalLM
|
||||
from .gptj import get_model as get_gptj_model
|
||||
|
||||
# from .gptj import get_model as get_gptj_model
|
||||
|
||||
SUPPORTED_MODELS = ["galactica", "gpt-j"]
|
||||
|
||||
@@ -25,7 +26,8 @@ def freeze_top_n_layers(model, target_layers):
|
||||
|
||||
|
||||
def get_specific_model(model_name, cache_dir, quantization):
|
||||
if "gpt-j" in model_name.lower():
|
||||
return get_gptj_model(model_name, cache_dir, quantization)
|
||||
else:
|
||||
return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
# if "gpt-j" in model_name.lower():
|
||||
# return get_gptj_model(model_name, cache_dir, quantization)
|
||||
# else:
|
||||
# return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
# Taken from https://github.com/sleekmike/Finetune_GPT-J_6B_8-bit/blob/master/gpt-j-6b-8-bit.py
|
||||
|
||||
import transformers
|
||||
from transformers import AutoModelForCausalLM
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from bitsandbytes.functional import dequantize_blockwise, quantize_blockwise
|
||||
from torch import nn
|
||||
from torch.cuda.amp import custom_fwd, custom_bwd
|
||||
from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
|
||||
from torch.cuda.amp import custom_bwd, custom_fwd
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
|
||||
class FrozenBNBLinear(nn.Module):
|
||||
@@ -19,32 +19,38 @@ class FrozenBNBLinear(nn.Module):
|
||||
self.register_buffer("code", code.requires_grad_(False))
|
||||
self.adapter = None
|
||||
self.bias = bias
|
||||
|
||||
|
||||
def forward(self, input):
|
||||
output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
|
||||
if self.adapter:
|
||||
output += self.adapter(input)
|
||||
return output
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
|
||||
weights_int8, state = quantize_blockise_lowmemory(linear.weight)
|
||||
return cls(weights_int8, *state, linear.bias)
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
|
||||
|
||||
|
||||
class DequantizeAndLinear(torch.autograd.Function):
|
||||
|
||||
|
||||
class DequantizeAndLinear(torch.autograd.Function):
|
||||
@staticmethod
|
||||
@custom_fwd
|
||||
def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
|
||||
absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
|
||||
def forward(
|
||||
ctx,
|
||||
input: torch.Tensor,
|
||||
weights_quantized: torch.ByteTensor,
|
||||
absmax: torch.FloatTensor,
|
||||
code: torch.FloatTensor,
|
||||
bias: torch.FloatTensor,
|
||||
):
|
||||
weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
|
||||
ctx.save_for_backward(input, weights_quantized, absmax, code)
|
||||
ctx._has_bias = bias is not None
|
||||
return F.linear(input, weights_deq, bias)
|
||||
|
||||
|
||||
@staticmethod
|
||||
@custom_bwd
|
||||
def backward(ctx, grad_output: torch.Tensor):
|
||||
@@ -55,8 +61,8 @@ class DequantizeAndLinear(torch.autograd.Function):
|
||||
grad_input = grad_output @ weights_deq
|
||||
grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
|
||||
return grad_input, None, None, None, grad_bias
|
||||
|
||||
|
||||
|
||||
|
||||
class FrozenBNBEmbedding(nn.Module):
|
||||
def __init__(self, weight, absmax, code):
|
||||
super().__init__()
|
||||
@@ -65,7 +71,7 @@ class FrozenBNBEmbedding(nn.Module):
|
||||
self.register_buffer("absmax", absmax.requires_grad_(False))
|
||||
self.register_buffer("code", code.requires_grad_(False))
|
||||
self.adapter = None
|
||||
|
||||
|
||||
def forward(self, input, **kwargs):
|
||||
with torch.no_grad():
|
||||
# note: both quantuized weights and input indices are *not* differentiable
|
||||
@@ -73,41 +79,41 @@ class FrozenBNBEmbedding(nn.Module):
|
||||
output = F.embedding(input, weight_deq, **kwargs)
|
||||
if self.adapter:
|
||||
output += self.adapter(input)
|
||||
return output
|
||||
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
|
||||
weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
|
||||
return cls(weights_int8, *state)
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
|
||||
|
||||
|
||||
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
|
||||
|
||||
|
||||
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2**20):
|
||||
assert chunk_size % 4096 == 0
|
||||
code = None
|
||||
chunks = []
|
||||
absmaxes = []
|
||||
flat_tensor = matrix.view(-1)
|
||||
for i in range((matrix.numel() - 1) // chunk_size + 1):
|
||||
input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
|
||||
input_chunk = flat_tensor[i * chunk_size : (i + 1) * chunk_size].clone()
|
||||
quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
|
||||
chunks.append(quantized_chunk)
|
||||
absmaxes.append(absmax_chunk)
|
||||
|
||||
|
||||
matrix_i8 = torch.cat(chunks).reshape_as(matrix)
|
||||
absmax = torch.cat(absmaxes)
|
||||
return matrix_i8, (absmax, code)
|
||||
|
||||
|
||||
|
||||
|
||||
def convert_to_int8(model):
|
||||
"""Convert linear and embedding modules to 8-bit with optional adapters"""
|
||||
for module in list(model.modules()):
|
||||
for name, child in module.named_children():
|
||||
if isinstance(child, nn.Linear):
|
||||
print(name, child)
|
||||
setattr(
|
||||
setattr(
|
||||
module,
|
||||
name,
|
||||
FrozenBNBLinear(
|
||||
@@ -125,7 +131,7 @@ def convert_to_int8(model):
|
||||
weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
|
||||
absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
|
||||
code=torch.zeros(256),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -141,7 +147,7 @@ class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
convert_to_int8(self)
|
||||
|
||||
|
||||
|
||||
class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
|
||||
def __init__(self, config):
|
||||
@@ -171,7 +177,7 @@ def get_model(model_name, cache_dir, quantization):
|
||||
if quantization is None:
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
elif quantization == "8bit":
|
||||
raise ValueError("Loading 8-bit model. Bitsandbytes does not behave so far...")
|
||||
raise ValueError("Loading 8-bit model. Use deepspeed instead.")
|
||||
transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
add_adapters(model)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
accelerate==0.15.0
|
||||
datasets==2.8.0
|
||||
deepspeed==0.7.7
|
||||
mpi4py==3.1.4
|
||||
numpy==1.23.0
|
||||
PyYAML==6.0
|
||||
scikit_learn==1.2.0
|
||||
torch==1.13.1
|
||||
transformers==4.25.1
|
||||
deepspeed==0.7.7
|
||||
mpi4py==3.1.4
|
||||
accelerate==0.15.0
|
||||
@@ -5,10 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
|
||||
import bitsandbytes as bnb
|
||||
|
||||
from transformers import PreTrainedModel, Trainer, TrainingArguments
|
||||
from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls
|
||||
|
||||
os.environ["WANDB_PROJECT"] = "supervised-finetuning"
|
||||
@@ -39,39 +36,23 @@ class SFTTrainer(Trainer):
|
||||
# By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct
|
||||
self.loss_fct = get_loss(loss_function)
|
||||
|
||||
def create_optimizer_and_scheduler(self, num_training_steps: int):
|
||||
if self.args.quantization == "8bit":
|
||||
self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))
|
||||
else:
|
||||
self.optimizer = torch.optim.AdamW(
|
||||
self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay
|
||||
)
|
||||
|
||||
self.lr_scheduler = get_cosine_schedule_with_warmup(
|
||||
self.optimizer,
|
||||
num_warmup_steps=self.args.warmup_steps,
|
||||
num_training_steps=self.num_train_steps,
|
||||
num_cycles=1,
|
||||
last_epoch=-1,
|
||||
)
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels_mask = inputs.pop("label_masks")
|
||||
targets = inputs.pop("targets")
|
||||
|
||||
outputs = model(**inputs)
|
||||
outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))
|
||||
|
||||
loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask)
|
||||
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
|
||||
def _compute_loss(self, model, inputs):
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
|
||||
labels_mask = inputs.pop("label_masks")
|
||||
targets = inputs.pop("targets")
|
||||
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
|
||||
outputs = model(**inputs)
|
||||
outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None))
|
||||
|
||||
logits = outputs.get("logits")
|
||||
|
||||
@@ -89,7 +70,7 @@ class SFTTrainer(Trainer):
|
||||
|
||||
with torch.no_grad():
|
||||
loss, logits, labels, labels_mask = self._compute_loss(model, inputs)
|
||||
labels[~labels_mask] = -100 # padding_index
|
||||
labels[~labels_mask.bool()] = -100 # padding_index
|
||||
|
||||
loss = loss.mean().detach()
|
||||
|
||||
|
||||
@@ -4,10 +4,10 @@ import yaml
|
||||
from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset
|
||||
from custom_datasets.dialogue_collator import DialogueDataCollator
|
||||
from losses import CrossEntropyLoss
|
||||
from models import freeze_top_n_layers, get_specific_model
|
||||
from sklearn.model_selection import train_test_split
|
||||
from torch.utils.data import ConcatDataset, Subset
|
||||
from transformers import AutoTokenizer
|
||||
from models import get_specific_model, SUPPORTED_MODELS, freeze_top_n_layers
|
||||
|
||||
|
||||
def get_tokenizer(conf):
|
||||
|
||||
Reference in New Issue
Block a user