From d3952354e2f97e076b72a57184d07cd04b332af3 Mon Sep 17 00:00:00 2001 From: Sotirios Anagnostidis Date: Fri, 6 Jan 2023 22:09:24 +0100 Subject: [PATCH] pre commits --- .../supervised_finetuning/configs/config.yaml | 2 +- .../configs/zero_config.json | 52 +++++++++++++++ .../supervised_finetuning/models/__init__.py | 12 ++-- model/supervised_finetuning/models/gptj.py | 66 ++++++++++--------- model/supervised_finetuning/requirements.txt | 6 +- model/supervised_finetuning/trainer.py | 31 ++------- model/supervised_finetuning/utils.py | 2 +- 7 files changed, 106 insertions(+), 65 deletions(-) create mode 100644 model/supervised_finetuning/configs/zero_config.json diff --git a/model/supervised_finetuning/configs/config.yaml b/model/supervised_finetuning/configs/config.yaml index fb2bdaa0..97e37121 100644 --- a/model/supervised_finetuning/configs/config.yaml +++ b/model/supervised_finetuning/configs/config.yaml @@ -62,4 +62,4 @@ debug: gradient_accumulation_steps: 1 per_device_train_batch_size: 1 per_device_eval_batch_size: 1 - quantization: \ No newline at end of file + quantization: diff --git a/model/supervised_finetuning/configs/zero_config.json b/model/supervised_finetuning/configs/zero_config.json new file mode 100644 index 00000000..e196d6a0 --- /dev/null +++ b/model/supervised_finetuning/configs/zero_config.json @@ -0,0 +1,52 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 0.1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": 2.0, + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/model/supervised_finetuning/models/__init__.py b/model/supervised_finetuning/models/__init__.py index b99053e3..21ddab9d 100644 --- a/model/supervised_finetuning/models/__init__.py +++ b/model/supervised_finetuning/models/__init__.py @@ -1,5 +1,6 @@ from transformers import AutoModelForCausalLM -from .gptj import get_model as get_gptj_model + +# from .gptj import get_model as get_gptj_model SUPPORTED_MODELS = ["galactica", "gpt-j"] @@ -25,7 +26,8 @@ def freeze_top_n_layers(model, target_layers): def get_specific_model(model_name, cache_dir, quantization): - if "gpt-j" in model_name.lower(): - return get_gptj_model(model_name, cache_dir, quantization) - else: - return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir) + return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir) + # if "gpt-j" in model_name.lower(): + # return get_gptj_model(model_name, cache_dir, quantization) + # else: + # return AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir) diff --git a/model/supervised_finetuning/models/gptj.py b/model/supervised_finetuning/models/gptj.py index d954c830..b61686c7 100644 --- a/model/supervised_finetuning/models/gptj.py +++ b/model/supervised_finetuning/models/gptj.py @@ -1,12 +1,12 @@ # Taken from https://github.com/sleekmike/Finetune_GPT-J_6B_8-bit/blob/master/gpt-j-6b-8-bit.py -import transformers -from transformers import AutoModelForCausalLM import torch import torch.nn.functional as F +import transformers +from bitsandbytes.functional import dequantize_blockwise, quantize_blockwise from torch import nn -from torch.cuda.amp import custom_fwd, custom_bwd -from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise +from torch.cuda.amp import custom_bwd, custom_fwd +from transformers import AutoModelForCausalLM class FrozenBNBLinear(nn.Module): @@ -19,32 +19,38 @@ class FrozenBNBLinear(nn.Module): self.register_buffer("code", code.requires_grad_(False)) self.adapter = None self.bias = bias - + def forward(self, input): output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias) if self.adapter: output += self.adapter(input) return output - + @classmethod def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear": weights_int8, state = quantize_blockise_lowmemory(linear.weight) return cls(weights_int8, *state, linear.bias) - + def __repr__(self): return f"{self.__class__.__name__}({self.in_features}, {self.out_features})" - - -class DequantizeAndLinear(torch.autograd.Function): + + +class DequantizeAndLinear(torch.autograd.Function): @staticmethod @custom_fwd - def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor, - absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor): + def forward( + ctx, + input: torch.Tensor, + weights_quantized: torch.ByteTensor, + absmax: torch.FloatTensor, + code: torch.FloatTensor, + bias: torch.FloatTensor, + ): weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code) ctx.save_for_backward(input, weights_quantized, absmax, code) ctx._has_bias = bias is not None return F.linear(input, weights_deq, bias) - + @staticmethod @custom_bwd def backward(ctx, grad_output: torch.Tensor): @@ -55,8 +61,8 @@ class DequantizeAndLinear(torch.autograd.Function): grad_input = grad_output @ weights_deq grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None return grad_input, None, None, None, grad_bias - - + + class FrozenBNBEmbedding(nn.Module): def __init__(self, weight, absmax, code): super().__init__() @@ -65,7 +71,7 @@ class FrozenBNBEmbedding(nn.Module): self.register_buffer("absmax", absmax.requires_grad_(False)) self.register_buffer("code", code.requires_grad_(False)) self.adapter = None - + def forward(self, input, **kwargs): with torch.no_grad(): # note: both quantuized weights and input indices are *not* differentiable @@ -73,41 +79,41 @@ class FrozenBNBEmbedding(nn.Module): output = F.embedding(input, weight_deq, **kwargs) if self.adapter: output += self.adapter(input) - return output - + return output + @classmethod def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding": weights_int8, state = quantize_blockise_lowmemory(embedding.weight) return cls(weights_int8, *state) - + def __repr__(self): return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})" - - -def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20): + + +def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2**20): assert chunk_size % 4096 == 0 code = None chunks = [] absmaxes = [] flat_tensor = matrix.view(-1) for i in range((matrix.numel() - 1) // chunk_size + 1): - input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone() + input_chunk = flat_tensor[i * chunk_size : (i + 1) * chunk_size].clone() quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code) chunks.append(quantized_chunk) absmaxes.append(absmax_chunk) - + matrix_i8 = torch.cat(chunks).reshape_as(matrix) absmax = torch.cat(absmaxes) return matrix_i8, (absmax, code) - - + + def convert_to_int8(model): """Convert linear and embedding modules to 8-bit with optional adapters""" for module in list(model.modules()): for name, child in module.named_children(): if isinstance(child, nn.Linear): print(name, child) - setattr( + setattr( module, name, FrozenBNBLinear( @@ -125,7 +131,7 @@ def convert_to_int8(model): weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8), absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1), code=torch.zeros(256), - ) + ), ) @@ -141,7 +147,7 @@ class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel): def __init__(self, config): super().__init__(config) convert_to_int8(self) - + class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM): def __init__(self, config): @@ -171,7 +177,7 @@ def get_model(model_name, cache_dir, quantization): if quantization is None: model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir) elif quantization == "8bit": - raise ValueError("Loading 8-bit model. Bitsandbytes does not behave so far...") + raise ValueError("Loading 8-bit model. Use deepspeed instead.") transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir) add_adapters(model) diff --git a/model/supervised_finetuning/requirements.txt b/model/supervised_finetuning/requirements.txt index 798b5950..7d78f36c 100644 --- a/model/supervised_finetuning/requirements.txt +++ b/model/supervised_finetuning/requirements.txt @@ -1,9 +1,9 @@ +accelerate==0.15.0 datasets==2.8.0 +deepspeed==0.7.7 +mpi4py==3.1.4 numpy==1.23.0 PyYAML==6.0 scikit_learn==1.2.0 torch==1.13.1 transformers==4.25.1 -deepspeed==0.7.7 -mpi4py==3.1.4 -accelerate==0.15.0 \ No newline at end of file diff --git a/model/supervised_finetuning/trainer.py b/model/supervised_finetuning/trainer.py index bb77b9c3..cb55131d 100644 --- a/model/supervised_finetuning/trainer.py +++ b/model/supervised_finetuning/trainer.py @@ -5,10 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import nn -from torch.utils.data import Dataset -from transformers import PreTrainedModel, Trainer, TrainingArguments, get_cosine_schedule_with_warmup -import bitsandbytes as bnb - +from transformers import PreTrainedModel, Trainer, TrainingArguments from utils import get_dataset, get_loss, get_model, get_tokenizer, read_yamls os.environ["WANDB_PROJECT"] = "supervised-finetuning" @@ -39,39 +36,23 @@ class SFTTrainer(Trainer): # By default CrossEntropyLoss ignores padding_index -100, but just in case use our own loss_fct self.loss_fct = get_loss(loss_function) - def create_optimizer_and_scheduler(self, num_training_steps: int): - if self.args.quantization == "8bit": - self.optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) - else: - self.optimizer = torch.optim.AdamW( - self.model.parameters(), lr=self.args.learning_rate, weight_decay=self.args.weight_decay - ) - - self.lr_scheduler = get_cosine_schedule_with_warmup( - self.optimizer, - num_warmup_steps=self.args.warmup_steps, - num_training_steps=self.num_train_steps, - num_cycles=1, - last_epoch=-1, - ) - def compute_loss(self, model, inputs, return_outputs=False): labels_mask = inputs.pop("label_masks") targets = inputs.pop("targets") - outputs = model(**inputs) + outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None)) loss = self.loss_fct(outputs.get("logits"), targets, mask=labels_mask) return (loss, outputs) if return_outputs else loss def _compute_loss(self, model, inputs): + inputs = self._prepare_inputs(inputs) + labels_mask = inputs.pop("label_masks") targets = inputs.pop("targets") - inputs = self._prepare_inputs(inputs) - - outputs = model(**inputs) + outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs.get("attention_mask", None)) logits = outputs.get("logits") @@ -89,7 +70,7 @@ class SFTTrainer(Trainer): with torch.no_grad(): loss, logits, labels, labels_mask = self._compute_loss(model, inputs) - labels[~labels_mask] = -100 # padding_index + labels[~labels_mask.bool()] = -100 # padding_index loss = loss.mean().detach() diff --git a/model/supervised_finetuning/utils.py b/model/supervised_finetuning/utils.py index 8f9ed5ca..d6abcff2 100644 --- a/model/supervised_finetuning/utils.py +++ b/model/supervised_finetuning/utils.py @@ -4,10 +4,10 @@ import yaml from custom_datasets import QA_SPECIAL_TOKENS, get_one_dataset from custom_datasets.dialogue_collator import DialogueDataCollator from losses import CrossEntropyLoss +from models import freeze_top_n_layers, get_specific_model from sklearn.model_selection import train_test_split from torch.utils.data import ConcatDataset, Subset from transformers import AutoTokenizer -from models import get_specific_model, SUPPORTED_MODELS, freeze_top_n_layers def get_tokenizer(conf):