This commit is contained in:
wassname
2023-04-10 16:15:52 +08:00
parent 5d3267d80d
commit 6e6c8f8e7a
22 changed files with 776 additions and 1982 deletions
+4
View File
@@ -5,3 +5,7 @@ llama-13b-4bit
llama-13b-4bit.pt
text-generation-webui/
repository/
loras/
loras
models/
models
+22
View File
@@ -0,0 +1,22 @@
{
"workbench.colorCustomizations": {
"activityBar.activeBackground": "#bb7714",
"activityBar.background": "#bb7714",
"activityBar.foreground": "#15202b",
"activityBar.inactiveForeground": "#15202b99",
"activityBarBadge.background": "#19e693",
"activityBarBadge.foreground": "#15202b",
"commandCenter.border": "#e7e7e799",
"sash.hoverBorder": "#bb7714",
"statusBar.background": "#8d5a0f",
"statusBar.foreground": "#e7e7e7",
"statusBarItem.hoverBackground": "#bb7714",
"statusBarItem.remoteBackground": "#8d5a0f",
"statusBarItem.remoteForeground": "#e7e7e7",
"titleBar.activeBackground": "#8d5a0f",
"titleBar.activeForeground": "#e7e7e7",
"titleBar.inactiveBackground": "#8d5a0f99",
"titleBar.inactiveForeground": "#e7e7e799"
},
"peacock.color": "#8d5a0f"
}
-78
View File
@@ -1,78 +0,0 @@
# syntax = docker/dockerfile:experimental
# Dockerfile is split into parts because we want to cache building the requirements and downloading the model, both of which can take a long time.
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS builder
RUN apt-get update && apt-get install -y python3 python3-pip git
RUN pip3 install --upgrade pip
# Some of the requirements expect some python packages in their setup.py, just install them first.
RUN --mount=type=cache,target=/root/.cache/pip pip install --user torch==2.0.0
RUN --mount=type=cache,target=/root/.cache/pip pip install --user semantic-version==2.10.0 requests tqdm
# The docker build environment has trouble detecting CUDA version, build for all reasonable archs
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6"
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache pip install --user -r requirements.txt
# -------------------------------
# Download the model
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS downloader
RUN apt-get update && apt-get install -y wget
RUN wget --progress=bar:force:noscroll https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt
# -------------------------------
#FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04
RUN --mount=type=cache,target=/var/cache/apt apt-get update && apt-get install -y git python3 python3-pip
RUN ln -s `which python3` /usr/bin/python
# Copy the installed packages from the first stage
COPY --from=builder /root/.local /root/.local
RUN mkdir alpaca_lora_4bit
WORKDIR alpaca_lora_4bit
COPY --from=downloader llama-7b-4bit.pt llama-7b-4bit.pt
#RUN git clone --depth=1 --branch main https://github.com/andybarry/text-generation-webui-4bit.git text-generation-webui-tmp
RUN git clone --depth=1 --branch main https://github.com/oobabooga/text-generation-webui.git text-generation-webui-tmp
RUN --mount=type=cache,target=/root/.cache pip install --user markdown gradio
# Apply monkey patch
RUN cd text-generation-webui-tmp && printf '%s'"import custom_monkey_patch # apply monkey patch\nimport gc\n\n" | cat - server.py > tmpfile && mv tmpfile server.py
# Get the model config
RUN cd text-generation-webui-tmp && python download-model.py --text-only decapoda-research/llama-7b-hf && mv models/decapoda-research_llama-7b-hf ../llama-7b-4bit
# Get LoRA
RUN cd text-generation-webui-tmp && python download-model.py samwit/alpaca7b-lora && mv loras/samwit_alpaca7b-lora ../alpaca7b_lora
COPY *.py .
COPY text-generation-webui text-generation-webui
COPY monkeypatch .
RUN mv -f text-generation-webui-tmp/* text-generation-webui/
# Symlink for monkeypatch
RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py .
# Swap to the 7bn parameter model
RUN sed -i 's/llama-13b-4bit/llama-7b-4bit/g' text-generation-webui/custom_monkey_patch.py && sed -i 's/alpaca13b_lora/alpaca7b_lora/g' text-generation-webui/custom_monkey_patch.py
# Run the server
WORKDIR /alpaca_lora_4bit/text-generation-webui
CMD ["python", "-u", "server.py", "--listen", "--chat"]
-105
View File
@@ -1,105 +0,0 @@
import os
class Finetune4bConfig:
"""Config holder for LLaMA 4bit finetuning
"""
def __init__(self, dataset: str, ds_type: str,
lora_out_dir: str, lora_apply_dir: str, resume_checkpoint: str,
llama_q4_config_dir: str, llama_q4_model: str,
mbatch_size: int, batch_size: int,
epochs: int, lr: float,
cutoff_len: int,
lora_r: int, lora_alpha: int, lora_dropout: float,
val_set_size: float,
gradient_checkpointing: bool,
gradient_checkpointing_ratio: float,
warmup_steps: int, save_steps: int, save_total_limit: int, logging_steps: int,
checkpoint: bool, skip: bool, verbose: bool,
txt_row_thd: int, use_eos_token: bool, groupsize: int, v1: bool,
local_rank: int, flash_attention: bool, backend: str
):
"""
Args:
dataset (str): Path to dataset file
ds_type (str): Dataset structure format
lora_out_dir (str): Directory to place new LoRA
lora_apply_dir (str): Path to directory from which LoRA has to be applied before training
resume_checkpoint (str): Path to Specified checkpoint you want to resume.
llama_q4_config_dir (str): Path to the config.json, tokenizer_config.json, etc
llama_q4_model (str): Path to the quantized model in huggingface format
mbatch_size (int): Micro-batch size
batch_size (int): Batch size
epochs (int): Epochs
lr (float): Learning rate
cutoff_len (int): Cutoff length
lora_r (int): LoRA R
lora_alpha (int): LoRA Alpha
lora_dropout (float): LoRA Dropout
gradient_checkpointing (bool) : Use gradient checkpointing
gradient_checkpointing_ratio (float) : Gradient checkpoint ratio
val_set_size (int): Validation set size
warmup_steps (int): Warmup steps before training
save_steps (int): Save steps
save_total_limit (int): Save total limit
logging_steps (int): Logging steps
checkpoint (bool): Produce checkpoint instead of LoRA
skip (bool): Don't train model
verbose (bool): If output log of training
txt_row_thd (int): Custom row thd for txt file
use_eos_token (bool): Use Eos token instead of padding with 0
groupsize (int): Group size of V2 model
v1 (bool): v1 model flag
local_rank (int): local rank if using torch.distributed.launch
flash_attention (bool): Enables flash attention
"""
self.dataset = dataset
self.ds_type = ds_type
self.lora_out_dir = lora_out_dir
self.lora_apply_dir = lora_apply_dir
self.resume_checkpoint = resume_checkpoint
self.llama_q4_config_dir = llama_q4_config_dir
self.llama_q4_model = llama_q4_model
self.mbatch_size = mbatch_size
self.batch_size = batch_size
self.gradient_accumulation_steps = self.batch_size // self.mbatch_size
self.epochs = epochs
self.lr = lr
self.cutoff_len = cutoff_len
self.lora_r = lora_r
self.lora_alpha = lora_alpha
self.lora_dropout = 0 if gradient_checkpointing else lora_dropout # should be 0 if gradient checkpointing is on
self.val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
self.gradient_checkpointing = gradient_checkpointing
self.gradient_checkpointing_ratio = gradient_checkpointing_ratio
self.warmup_steps = warmup_steps
self.save_steps = save_steps
self.save_total_limit = save_total_limit
self.logging_steps = logging_steps
self.checkpoint = checkpoint
self.skip = skip
self.verbose = verbose
self.txt_row_thd = txt_row_thd
self.use_eos_token = use_eos_token
self.world_size = int(os.environ.get("WORLD_SIZE", 1))
self.local_rank = int(os.environ.get("LOCAL_RANK", local_rank))
self.ddp = self.world_size != 1
self.device_map = "auto" if not self.ddp else {"": self.local_rank}
if self.ddp:
self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size
self.groupsize = groupsize
self.v1 = v1
self.flash_attention = flash_attention
self.backend = backend
def __str__(self) -> str:
s = f"\nParameters:\n{'config':-^20}\n{self.dataset=}\n{self.ds_type=}\n{self.lora_out_dir=}\n{self.lora_apply_dir=}\n{self.llama_q4_config_dir=}\n{self.llama_q4_model=}\n\n" +\
f"{'training':-^20}\n" +\
f"{self.mbatch_size=}\n{self.batch_size=}\n{self.gradient_accumulation_steps=}\n{self.epochs=}\n{self.lr=}\n{self.cutoff_len=}\n" +\
f"{self.lora_r=}\n{self.lora_alpha=}\n{self.lora_dropout=}\n{self.val_set_size=}\n" +\
f"{self.gradient_checkpointing=}\n{self.gradient_checkpointing_ratio=}\n" +\
f"{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
f"{self.logging_steps=}\n" +\
f"{self.checkpoint=}\n{self.skip=}\n" +\
f"{self.world_size=}\n{self.ddp=}\n{self.device_map=}\n" +\
f"{self.groupsize=}\n{self.v1=}\n{self.backend=}\n"
return s.replace("self.", "")
Binary file not shown.

Before

Width:  |  Height:  |  Size: 347 KiB

-26
View File
@@ -1,26 +0,0 @@
import torch
class AMPWrapper:
def __init__(self, model, options=None):
self.model = model
self.options = options
if self.options is None:
self.options = {'enabled': True, 'device_type': 'cuda'}
def autocast_forward(self, *args, **kwargs):
with torch.amp.autocast(**self.options):
return self.model.non_autocast_forward(*args, **kwargs)
def autocast_generate(self, *args, **kwargs):
with torch.amp.autocast(**self.options):
return self.model.non_autocast_generate(*args, **kwargs)
def apply_forward(self):
self.model.non_autocast_forward = self.model.forward
self.model.forward = self.autocast_forward
def apply_generate(self):
self.model.non_autocast_generate = self.model.generate
self.model.generate = self.autocast_generate
-115
View File
@@ -1,115 +0,0 @@
import os
import argparse
from Finetune4bConfig import Finetune4bConfig
def parse_commandline():
parser = argparse.ArgumentParser(
prog=__file__.split(os.path.sep)[-1],
description="Produce LoRA in 4bit training",
usage="%(prog)s [config] [training]\n\nAll arguments are optional"
)
parser.add_argument("dataset", nargs="?",
default="./dataset.json",
help="Path to dataset file. Default: %(default)s"
)
parser_config = parser.add_argument_group("config")
parser_training = parser.add_argument_group("training")
# Config args group
parser_config.add_argument("--ds_type", choices=["txt", "alpaca", "gpt4all"], default="alpaca", required=False,
help="Dataset structure format. Default: %(default)s"
)
parser_config.add_argument("--lora_out_dir", default="alpaca_lora", required=False,
help="Directory to place new LoRA. Default: %(default)s"
)
parser_config.add_argument("--lora_apply_dir", default=None, required=False,
help="Path to directory from which LoRA has to be applied before training. Default: %(default)s"
)
parser_training.add_argument("--resume_checkpoint", default=None, required=False,
help="Resume training from specified checkpoint. Default: %(default)s"
)
parser_config.add_argument("--llama_q4_config_dir", default="./llama-13b-4bit/", required=False,
help="Path to the config.json, tokenizer_config.json, etc. Default: %(default)s"
)
parser_config.add_argument("--llama_q4_model", default="./llama-13b-4bit.pt", required=False,
help="Path to the quantized model in huggingface format. Default: %(default)s"
)
# Training args group
parser_training.add_argument("--mbatch_size", default=1, type=int, help="Micro-batch size. Default: %(default)s")
parser_training.add_argument("--batch_size", default=2, type=int, help="Batch size. Default: %(default)s")
parser_training.add_argument("--epochs", default=3, type=int, help="Epochs. Default: %(default)s")
parser_training.add_argument("--lr", default=2e-4, type=float, help="Learning rate. Default: %(default)s")
parser_training.add_argument("--cutoff_len", default=256, type=int, help="Default: %(default)s")
parser_training.add_argument("--lora_r", default=8, type=int, help="Default: %(default)s")
parser_training.add_argument("--lora_alpha", default=16, type=int, help="Default: %(default)s")
parser_training.add_argument("--lora_dropout", default=0.05, type=float, help="Default: %(default)s")
parser_training.add_argument("--grad_chckpt", action="store_true", required=False, help="Use gradient checkpoint. For 30B model. Default: %(default)s")
parser_training.add_argument("--grad_chckpt_ratio", default=1, type=float, help="Gradient checkpoint ratio. Default: %(default)s")
parser_training.add_argument("--val_set_size", default=0.2, type=float, help="Validation set size. Default: %(default)s")
parser_training.add_argument("--warmup_steps", default=50, type=int, help="Default: %(default)s")
parser_training.add_argument("--save_steps", default=50, type=int, help="Default: %(default)s")
parser_training.add_argument("--save_total_limit", default=3, type=int, help="Default: %(default)s")
parser_training.add_argument("--logging_steps", default=10, type=int, help="Default: %(default)s")
parser_training.add_argument("-c", "--checkpoint", action="store_true", help="Produce checkpoint instead of LoRA. Default: %(default)s")
parser_training.add_argument("--skip", action="store_true", help="Don't train model. Can be useful to produce checkpoint from existing LoRA. Default: %(default)s")
parser_training.add_argument("--verbose", action="store_true", help="If output log of training. Default: %(default)s")
# Data args
parser_training.add_argument("--txt_row_thd", default=-1, type=int, help="Custom thd for txt rows.")
parser_training.add_argument("--use_eos_token", default=1, type=int, help="Use eos token instead if padding with 0. enable with 1, disable with 0.")
# V2 model support
parser_training.add_argument("--groupsize", type=int, default=-1, help="Groupsize of v2 model")
parser_training.add_argument("--v1", action="store_true", help="Use V1 model")
# Multi GPU Support
parser_training.add_argument("--local_rank", type=int, default=0, help="local rank if using torch.distributed.launch")
# Flash Attention
parser_training.add_argument("--flash_attention", action="store_true", help="enables flash attention, can improve performance and reduce VRAM use")
# Train Backend
parser_training.add_argument("--backend", type=str, default='cuda', help="Backend to use. Triton or Cuda.")
return vars(parser.parse_args())
def get_config() -> Finetune4bConfig:
args = parse_commandline()
return Finetune4bConfig(
dataset=args["dataset"],
ds_type=args["ds_type"],
lora_out_dir=args["lora_out_dir"],
lora_apply_dir=args["lora_apply_dir"],
resume_checkpoint=args["resume_checkpoint"],
llama_q4_config_dir=args["llama_q4_config_dir"],
llama_q4_model=args["llama_q4_model"],
mbatch_size=args["mbatch_size"],
batch_size=args["batch_size"],
epochs=args["epochs"],
lr=args["lr"],
cutoff_len=args["cutoff_len"],
lora_r=args["lora_r"],
lora_alpha=args["lora_alpha"],
lora_dropout=args["lora_dropout"],
val_set_size=args["val_set_size"],
gradient_checkpointing=args["grad_chckpt"],
gradient_checkpointing_ratio=args["grad_chckpt_ratio"],
warmup_steps=args["warmup_steps"],
save_steps=args["save_steps"],
save_total_limit=args["save_total_limit"],
logging_steps=args["logging_steps"],
checkpoint=args["checkpoint"],
skip=args["skip"],
verbose=args["verbose"],
txt_row_thd=args["txt_row_thd"],
use_eos_token=args["use_eos_token"]!=0,
groupsize=args["groupsize"],
v1=args["v1"],
local_rank=args["local_rank"],
flash_attention=args["flash_attention"],
backend=args["backend"],
)
+295 -292
View File
@@ -1,292 +1,295 @@
import matmul_utils_4bit as mm4b
import torch
import torch.nn as nn
import time
import math
from torch.cuda.amp import custom_bwd, custom_fwd
from colorama import init, Fore, Back, Style
init(autoreset=True)
class AutogradMatmul4bitCuda(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None
try:
import triton_utils as tu
class AutogradMatmul4bitTriton(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
output = tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
ctx.save_for_backward(qweight, scales, qzeros, g_idx)
ctx.bits, ctx.maxq = bits, maxq
output = output.clone()
return output
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, qzeros, g_idx = ctx.saved_tensors
bits, maxq = ctx.bits, ctx.maxq
grad_input = None
if ctx.needs_input_grad[0]:
grad_input = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None
except ImportError:
print('Triton not found. Please run "pip install triton".')
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
def switch_backend_to(to_backend):
global AutogradMatmul4bit
global backend
if to_backend == 'cuda':
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
print(Style.BRIGHT + Fore.GREEN + 'Using CUDA implementation.')
elif to_backend == 'triton':
# detect if AutogradMatmul4bitTriton is defined
if 'AutogradMatmul4bitTriton' not in globals():
raise ValueError('Triton not found. Please install triton_utils.')
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
print(Style.BRIGHT + Fore.GREEN + 'Using Triton implementation.')
else:
raise ValueError('Backend not supported.')
def matmul4bit_with_backend(x, qweight, scales, qzeros, g_idx, bits, maxq):
if backend == 'cuda':
return mm4b.matmul4bit(x, qweight, scales, qzeros, g_idx)
elif backend == 'triton':
assert qzeros.dtype == torch.int32
return tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
else:
raise ValueError('Backend not supported.')
# Assumes layer is perfectly divisible into 256 * 256 blocks
class Autograd4bitQuantLinear(nn.Module):
def __init__(self, in_features, out_features, groupsize=-1, is_v1_model=False):
super().__init__()
bits = 4
self.in_features = in_features
self.out_features = out_features
self.bits = bits
self.maxq = 2 ** self.bits - 1
groupsize = groupsize if groupsize != -1 else in_features
self.groupsize = groupsize
self.is_v1_model = is_v1_model
if is_v1_model:
self.register_buffer('zeros', torch.empty((out_features, 1)))
self.register_buffer('scales', torch.empty((out_features, 1)))
self.g_idx = None
else:
self.register_buffer('qzeros',
torch.empty((math.ceil(in_features/groupsize), out_features // 256 * (bits * 8)), dtype=torch.int32)
)
self.register_buffer('scales', torch.empty((math.ceil(in_features/groupsize), out_features)))
self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(in_features)], dtype = torch.int32))
self.register_buffer('bias', torch.empty(out_features))
self.register_buffer(
'qweight', torch.empty((in_features // 256 * (bits * 8), out_features), dtype=torch.int32)
)
def forward(self, x):
if torch.is_grad_enabled():
out = AutogradMatmul4bit.apply(x, self.qweight, self.scales,
self.qzeros if not self.is_v1_model else self.zeros,
self.g_idx, self.bits, self.maxq)
else:
out = matmul4bit_with_backend(x, self.qweight, self.scales,
self.qzeros if not self.is_v1_model else self.zeros,
self.g_idx, self.bits, self.maxq)
out += self.bias
return out
def make_quant_for_4bit_autograd(module, names, name='', groupsize=-1, is_v1_model=False):
if isinstance(module, Autograd4bitQuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
setattr(
module, attr, Autograd4bitQuantLinear(tmp.in_features, tmp.out_features, groupsize=groupsize, is_v1_model=is_v1_model)
)
for name1, child in module.named_children():
make_quant_for_4bit_autograd(child, names, name + '.' + name1 if name != '' else name1, groupsize=groupsize, is_v1_model=is_v1_model)
def model_to_half(model):
model.half()
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear):
if m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
print(Style.BRIGHT + Fore.YELLOW + 'Converted as Half.')
def model_to_float(model):
model.float()
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear):
if m.is_v1_model:
m.zeros = m.zeros.float()
m.scales = m.scales.float()
m.bias = m.bias.float()
print(Style.BRIGHT + Fore.YELLOW + 'Converted as Float.')
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
if type(module) in layers:
return {name: module}
res = {}
for name1, child in module.named_children():
res.update(find_layers(
child, layers=layers, name=name + '.' + name1 if name != '' else name1
))
return res
def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=False, device_map="auto", seqlen=2048, is_v1_model=False):
import accelerate
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
t0 = time.time()
with accelerate.init_empty_weights():
config = LlamaConfig.from_pretrained(config_path)
model = LlamaForCausalLM(config)
model = model.eval()
layers = find_layers(model)
for name in ['lm_head']:
if name in layers:
del layers[name]
make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
model = accelerate.load_checkpoint_and_dispatch(
model=model,
checkpoint=model_path,
device_map=device_map,
no_split_module_classes=["LlamaDecoderLayer"]
)
model.seqlen = seqlen
if half:
model_to_half(model)
tokenizer = LlamaTokenizer.from_pretrained(config_path)
tokenizer.truncation_side = 'left'
print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
return model, tokenizer
def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path=None, groupsize=-1, seqlen=2048, max_memory=None, is_v1_model=False):
import accelerate
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
if max_memory is None:
max_memory = {0: '24Gib', 'cpu': '48Gib'}
print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
t0 = time.time()
with accelerate.init_empty_weights():
config = LlamaConfig.from_pretrained(config_path)
model = LlamaForCausalLM(config)
model = model.eval()
layers = find_layers(model)
for name in ['lm_head']:
if name in layers:
del layers[name]
make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
accelerate.load_checkpoint_in_model(model, checkpoint=model_path, device_map={'': 'cpu'})
# rotary_emb fix
for n, m in model.named_modules():
if 'rotary_emb' in n:
cos_cached = m.cos_cached.clone().cpu()
sin_cached = m.sin_cached.clone().cpu()
break
if lora_path is not None:
from peft import PeftModel
from peft.tuners.lora import Linear4bitLt
model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))
model.seqlen = seqlen
print('Apply half ...')
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear) or ((lora_path is not None) and isinstance(m, Linear4bitLt)):
if m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
print('Dispatching model ...')
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True, main_device=0)
torch.cuda.empty_cache()
print(Style.BRIGHT + Fore.YELLOW + 'Total {:.2f} Gib VRAM used.'.format(torch.cuda.memory_allocated() / 1024 / 1024))
# rotary_emb fix
for n, m in model.named_modules():
if 'rotary_emb' in n:
if getattr(m, '_hf_hook', None):
if isinstance(m._hf_hook, accelerate.hooks.SequentialHook):
hooks = m._hf_hook.hooks
else:
hooks = [m._hf_hook]
for hook in hooks:
if hook.offload:
if n + '.sin_cached' not in hook.weights_map.dataset.state_dict.keys():
hook.weights_map.dataset.state_dict[n + '.sin_cached'] = sin_cached.clone().cpu()
hook.weights_map.dataset.state_dict[n + '.cos_cached'] = cos_cached.clone().cpu()
tokenizer = LlamaTokenizer.from_pretrained(config_path)
tokenizer.truncation_side = 'left'
print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
return model, tokenizer
load_llama_model_4bit_low_ram_and_offload_to_cpu = load_llama_model_4bit_low_ram_and_offload
"""
from https://raw.githubusercontent.com/johnsmith0031/alpaca_lora_4bit/main/autograd_4bit.py
"""
import matmul_utils_4bit as mm4b
import torch
import torch.nn as nn
import time
import math
from torch.cuda.amp import custom_bwd, custom_fwd
from colorama import init, Fore, Back, Style
init(autoreset=True)
class AutogradMatmul4bitCuda(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
ctx.save_for_backward(qweight, scales, zeros, g_idx)
if g_idx is None:
output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
else:
output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
output = output.clone()
return output
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, zeros, g_idx = ctx.saved_tensors
if ctx.needs_input_grad[0]:
if g_idx is None:
grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
else:
grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
return grad, None, None, None, None, None, None
try:
import triton_utils as tu
class AutogradMatmul4bitTriton(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
output = tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
ctx.save_for_backward(qweight, scales, qzeros, g_idx)
ctx.bits, ctx.maxq = bits, maxq
output = output.clone()
return output
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
qweight, scales, qzeros, g_idx = ctx.saved_tensors
bits, maxq = ctx.bits, ctx.maxq
grad_input = None
if ctx.needs_input_grad[0]:
grad_input = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None
except ImportError:
print('Triton not found. Please run "pip install triton".')
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
def switch_backend_to(to_backend):
global AutogradMatmul4bit
global backend
if to_backend == 'cuda':
AutogradMatmul4bit = AutogradMatmul4bitCuda
backend = 'cuda'
print(Style.BRIGHT + Fore.GREEN + 'Using CUDA implementation.')
elif to_backend == 'triton':
# detect if AutogradMatmul4bitTriton is defined
if 'AutogradMatmul4bitTriton' not in globals():
raise ValueError('Triton not found. Please install triton_utils.')
AutogradMatmul4bit = AutogradMatmul4bitTriton
backend = 'triton'
print(Style.BRIGHT + Fore.GREEN + 'Using Triton implementation.')
else:
raise ValueError('Backend not supported.')
def matmul4bit_with_backend(x, qweight, scales, qzeros, g_idx, bits, maxq):
if backend == 'cuda':
return mm4b.matmul4bit(x, qweight, scales, qzeros, g_idx)
elif backend == 'triton':
assert qzeros.dtype == torch.int32
return tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
else:
raise ValueError('Backend not supported.')
# Assumes layer is perfectly divisible into 256 * 256 blocks
class Autograd4bitQuantLinear(nn.Module):
def __init__(self, in_features, out_features, groupsize=-1, is_v1_model=False):
super().__init__()
bits = 4
self.in_features = in_features
self.out_features = out_features
self.bits = bits
self.maxq = 2 ** self.bits - 1
groupsize = groupsize if groupsize != -1 else in_features
self.groupsize = groupsize
self.is_v1_model = is_v1_model
if is_v1_model:
self.register_buffer('zeros', torch.empty((out_features, 1)))
self.register_buffer('scales', torch.empty((out_features, 1)))
self.g_idx = None
else:
self.register_buffer('qzeros',
torch.empty((math.ceil(in_features/groupsize), out_features // 256 * (bits * 8)), dtype=torch.int32)
)
self.register_buffer('scales', torch.empty((math.ceil(in_features/groupsize), out_features)))
self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(in_features)], dtype = torch.int32))
self.register_buffer('bias', torch.empty(out_features))
self.register_buffer(
'qweight', torch.empty((in_features // 256 * (bits * 8), out_features), dtype=torch.int32)
)
def forward(self, x):
if torch.is_grad_enabled():
out = AutogradMatmul4bit.apply(x, self.qweight, self.scales,
self.qzeros if not self.is_v1_model else self.zeros,
self.g_idx, self.bits, self.maxq)
else:
out = matmul4bit_with_backend(x, self.qweight, self.scales,
self.qzeros if not self.is_v1_model else self.zeros,
self.g_idx, self.bits, self.maxq)
out += self.bias
return out
def make_quant_for_4bit_autograd(module, names, name='', groupsize=-1, is_v1_model=False):
if isinstance(module, Autograd4bitQuantLinear):
return
for attr in dir(module):
tmp = getattr(module, attr)
name1 = name + '.' + attr if name != '' else attr
if name1 in names:
setattr(
module, attr, Autograd4bitQuantLinear(tmp.in_features, tmp.out_features, groupsize=groupsize, is_v1_model=is_v1_model)
)
for name1, child in module.named_children():
make_quant_for_4bit_autograd(child, names, name + '.' + name1 if name != '' else name1, groupsize=groupsize, is_v1_model=is_v1_model)
def model_to_half(model):
model.half()
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear):
if m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
print(Style.BRIGHT + Fore.YELLOW + 'Converted as Half.')
def model_to_float(model):
model.float()
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear):
if m.is_v1_model:
m.zeros = m.zeros.float()
m.scales = m.scales.float()
m.bias = m.bias.float()
print(Style.BRIGHT + Fore.YELLOW + 'Converted as Float.')
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
if type(module) in layers:
return {name: module}
res = {}
for name1, child in module.named_children():
res.update(find_layers(
child, layers=layers, name=name + '.' + name1 if name != '' else name1
))
return res
def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=False, device_map="auto", seqlen=2048, is_v1_model=False):
import accelerate
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
t0 = time.time()
with accelerate.init_empty_weights():
config = LlamaConfig.from_pretrained(config_path)
model = LlamaForCausalLM(config)
model = model.eval()
layers = find_layers(model)
for name in ['lm_head']:
if name in layers:
del layers[name]
make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
model = accelerate.load_checkpoint_and_dispatch(
model=model,
checkpoint=model_path,
device_map=device_map,
no_split_module_classes=["LlamaDecoderLayer"]
)
model.seqlen = seqlen
if half:
model_to_half(model)
tokenizer = LlamaTokenizer.from_pretrained(config_path)
tokenizer.truncation_side = 'left'
print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
return model, tokenizer
def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path=None, groupsize=-1, seqlen=2048, max_memory=None, is_v1_model=False):
import accelerate
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
if max_memory is None:
max_memory = {0: '24Gib', 'cpu': '48Gib'}
print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
t0 = time.time()
with accelerate.init_empty_weights():
config = LlamaConfig.from_pretrained(config_path)
model = LlamaForCausalLM(config)
model = model.eval()
layers = find_layers(model)
for name in ['lm_head']:
if name in layers:
del layers[name]
make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
accelerate.load_checkpoint_in_model(model, checkpoint=model_path, device_map={'': 'cpu'})
# rotary_emb fix
for n, m in model.named_modules():
if 'rotary_emb' in n:
cos_cached = m.cos_cached.clone().cpu()
sin_cached = m.sin_cached.clone().cpu()
break
if lora_path is not None:
from peft import PeftModel
from peft.tuners.lora import Linear4bitLt
model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))
model.seqlen = seqlen
print('Apply half ...')
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear) or ((lora_path is not None) and isinstance(m, Linear4bitLt)):
if m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
print('Dispatching model ...')
device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True, main_device=0)
torch.cuda.empty_cache()
print(Style.BRIGHT + Fore.YELLOW + 'Total {:.2f} Gib VRAM used.'.format(torch.cuda.memory_allocated() / 1024 / 1024))
# rotary_emb fix
for n, m in model.named_modules():
if 'rotary_emb' in n:
if getattr(m, '_hf_hook', None):
if isinstance(m._hf_hook, accelerate.hooks.SequentialHook):
hooks = m._hf_hook.hooks
else:
hooks = [m._hf_hook]
for hook in hooks:
if hook.offload:
if n + '.sin_cached' not in hook.weights_map.dataset.state_dict.keys():
hook.weights_map.dataset.state_dict[n + '.sin_cached'] = sin_cached.clone().cpu()
hook.weights_map.dataset.state_dict[n + '.cos_cached'] = cos_cached.clone().cpu()
tokenizer = LlamaTokenizer.from_pretrained(config_path)
tokenizer.truncation_side = 'left'
print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
return model, tokenizer
load_llama_model_4bit_low_ram_and_offload_to_cpu = load_llama_model_4bit_low_ram_and_offload
-167
View File
@@ -1,167 +0,0 @@
#https://github.com/fpgaminer/GPTQ-triton
"""
Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
"""
import builtins
import math
import time
from typing import Dict
import triton
class Autotuner(triton.KernelInterface):
def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False):
'''
:param prune_configs_by: a dict of functions that are used to prune configs, fields:
'perf_model': performance model used to predicate running time with different configs, returns running time
'top_k': number of configs to bench
'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs.
'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
'''
if not configs:
self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
else:
self.configs = configs
self.key_idx = [arg_names.index(k) for k in key]
self.nearest_power_of_two = nearest_power_of_two
self.cache = {}
# hook to reset all required tensor to zeros before relaunching a kernel
self.hook = lambda args: 0
if reset_to_zero is not None:
self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
def _hook(args):
for i in self.reset_idx:
args[i].zero_()
self.hook = _hook
self.arg_names = arg_names
# prune configs
if prune_configs_by:
perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k']
if 'early_config_prune' in prune_configs_by:
early_config_prune = prune_configs_by['early_config_prune']
else:
perf_model, top_k, early_config_prune = None, None, None
self.perf_model, self.configs_top_k = perf_model, top_k
self.early_config_prune = early_config_prune
self.fn = fn
def _bench(self, *args, config, **meta):
# check for conflicts, i.e. meta-parameters both provided
# as kwargs and by the autotuner
conflicts = meta.keys() & config.kwargs.keys()
if conflicts:
raise ValueError(
f"Conflicting meta-parameters: {', '.join(conflicts)}."
" Make sure that you don't re-define auto-tuned symbols."
)
# augment meta-parameters with tunable ones
current = dict(meta, **config.kwargs)
def kernel_call():
if config.pre_hook:
config.pre_hook(self.nargs)
self.hook(args)
self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
try:
# In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
# PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
return triton.testing.do_bench(kernel_call, rep=40)
except triton.compiler.OutOfResources:
return float('inf')
def run(self, *args, **kwargs):
self.nargs = dict(zip(self.arg_names, args))
if len(self.configs) > 1:
key = tuple(args[i] for i in self.key_idx)
# This reduces the amount of autotuning by rounding the keys to the nearest power of two
# In my testing this gives decent results, and greatly reduces the amount of tuning required
if self.nearest_power_of_two:
key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])
if key not in self.cache:
# prune configs
pruned_configs = self.prune_configs(kwargs)
bench_start = time.time()
timings = {config: self._bench(*args, config=config, **kwargs)
for config in pruned_configs}
bench_end = time.time()
self.bench_time = bench_end - bench_start
self.cache[key] = builtins.min(timings, key=timings.get)
self.hook(args)
self.configs_timings = timings
config = self.cache[key]
else:
config = self.configs[0]
self.best_config = config
if config.pre_hook is not None:
config.pre_hook(self.nargs)
return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
def prune_configs(self, kwargs):
pruned_configs = self.configs
if self.early_config_prune:
pruned_configs = self.early_config_prune(self.configs, self.nargs)
if self.perf_model:
top_k = self.configs_top_k
if isinstance(top_k, float) and top_k <= 1.0:
top_k = int(len(self.configs) * top_k)
if len(pruned_configs) > top_k:
est_timing = {
config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages,
num_warps=config.num_warps)
for config in pruned_configs
}
pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
return pruned_configs
def warmup(self, *args, **kwargs):
self.nargs = dict(zip(self.arg_names, args))
for config in self.prune_configs(kwargs):
self.fn.warmup(
*args,
num_warps=config.num_warps,
num_stages=config.num_stages,
**kwargs,
**config.kwargs,
)
self.nargs = None
def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
"""
Decorator for auto-tuning a :code:`triton.jit`'d function.
.. highlight:: python
.. code-block:: python
@triton.autotune(configs=[
triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
],
key=['x_size'] # the two above configs will be evaluated anytime
# the value of x_size changes
)
@triton.jit
def kernel(x_ptr, x_size, **META):
BLOCK_SIZE = META['BLOCK_SIZE']
:note: When all the configurations are evaluated, the kernel will run multiple time.
This means that whatever value the kernel updates will be updated multiple times.
To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
reset the value of the provided tensor to `zero` before running any configuration.
:param configs: a list of :code:`triton.Config` objects
:type configs: list[triton.Config]
:param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
:type key: list[str]
:param prune_configs_by: a dict of functions that are used to prune configs, fields:
'perf_model': performance model used to predicate running time with different configs, returns running time
'top_k': number of configs to bench
'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs.
:param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
:type reset_to_zero: list[str]
"""
def decorator(fn):
return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by, nearest_power_of_two)
return decorator
-106
View File
@@ -1,106 +0,0 @@
The alpaca (Lama pacos) is a species of South American camelid mammal. It is similar to, and often confused with, the llama. However, alpacas are often noticeably smaller than llamas. The two animals are closely related and can successfully crossbreed. Both species are believed to have been domesticated from their wild relatives, the vicuña and guanaco. There are two breeds of alpaca: the Suri alpaca and the Huacaya alpaca.
Alpacas are kept in herds that graze on the level heights of the Andes of Southern Peru, Western Bolivia, Ecuador, and Northern Chile at an altitude of 3,500 to 5,000 metres (11,000 to 16,000 feet) above sea level.[1] Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber.
Alpaca fiber is used for making knitted and woven items, similar to sheep's wool. These items include blankets, sweaters, hats, gloves, scarves, a wide variety of textiles, and ponchos, in South America, as well as sweaters, socks, coats, and bedding in other parts of the world. The fiber comes in more than 52 natural colors as classified in Peru, 12 as classified in Australia, and 16 as classified in the United States.
Alpacas communicate through body language. The most common is spitting to show dominance[2] when they are in distress, fearful, or feel agitated. Male alpacas are more aggressive than females, and tend to establish dominance within their herd group. In some cases, alpha males will immobilize the head and neck of a weaker or challenging male in order to show their strength and dominance.
In the textile industry, "alpaca" primarily refers to the hair of Peruvian alpacas, but more broadly it refers to a style of fabric originally made from alpaca hair, such as mohair, Icelandic sheep wool, or even high-quality wool from other breeds of sheep. In trade, distinctions are made between alpacas and the several styles of mohair and luster.[3]
An adult alpaca generally is between 81 and 99 centimetres (32 and 39 inches) in height at the shoulders (withers). They usually weigh between 48 and 90 kilograms (106 and 198 pounds).[4] Raised in the same conditions, the difference in weight can be small with males weighting around 22.3 kilograms (49 lb 3 oz) and females 21.3 kilograms (46 lb 15 oz).[5]
The relationship between alpacas and vicuñas was disputed for many years. In the 18th and 19th centuries, the four South American lamoid species were assigned scientific names. At that time, the alpaca was assumed to be descended from the llama, ignoring similarities in size, fleece and dentition between the alpaca and the vicuña. Classification was complicated by the fact that all four species of South American camelid can interbreed and produce fertile offspring.[6] The advent of DNA technology made a more accurate classification possible.
In 2001, the alpaca genus classification changed from Lama pacos to Vicugna pacos, following the presentation of a paper[7] on work by Miranda Kadwell et al. on alpaca DNA to the Royal Society showing the alpaca is descended from the vicuña, not the guanaco.
Alpacas were domesticated thousands of years ago. The Moche people of Northern Peru often used alpaca images in their art.[8] There are no known wild alpacas, and its closest living relative, the vicuña (also native to South America), is the wild ancestor of the alpaca.
The family Camelidae first appeared in Americas 4045 million years ago, during the Eocene period, from the common ancestor, Protylopus. The descendants divided into Camelini and Lamini tribes, taking different migratory patterns to Asia and South America, respectively. Although the camelids became extinct in North America around 3 million years ago, it flourished in the South with the species we see today.[9] It was not until 25 million years ago, during the Pliocene, that the genus Hemiauchenia of the tribe Lamini split into Palaeolama and Lama; the latter would then split again into Lama and Vicugna upon migrating down to South America.
Remains of vicuña and guanaco have been found throughout Peru for around 12,000 years. Their domesticated counterparts, the llama and alpacas, have been found mummified in the Moquegua valley, in the south of Peru, dating back 900 to 1000 years. Mummies found in this region show two breeds of alpacas. More precise analysis of bone and teeth of these mummies has demonstrated that alpacas were domesticated from the Vicugna vicugna. Other research, considering the behavioral and morphological characteristics of alpacas and their wild counterparts, seems to indicate that alpacas could find their origins in Lama guanicoe as well as Vicugna vicugna, or even a hybrid of both.
Genetic analysis shows a different picture of the origins of the alpaca. Analysis of mitochondrial DNA shows that most alpacas have guanaco mtDNA, and many also have vicuña mtDNA. But microsatellite data shows that alpaca DNA is much more similar to vicuña DNA than to guanaco DNA. This suggests that alpacas are descendants of the Vicugna vicugna, not of the Lama guanicoe. The discrepancy with mtDNA seems to be a result of the fact that mtDNA is only transmitted by the mother, and recent husbandry practices have caused hybridization between llamas (which primarily carry guanaco DNA) and alpacas. To the extent that many of today's domestic alpacas are the result of male alpacas bred to female llamas, this would explain the mtDNA consistent with guanacos. This situation has led to attempts to reclassify the alpaca as Vicugna pacos.[7]
The alpaca comes in two breeds, Suri and Huacaya, based on their fibers rather than scientific or European classifications.
Huacaya alpacas are the most commonly found, constituting about 90% of the population.[10] The Huacaya alpaca is thought to have originated in post-colonial Peru. This is due to their thicker fleece which makes them more suited to survive in the higher altitudes of the Andes after being pushed into the highlands of Peru with the arrival of the Spanish.[11][better source needed]
Suri alpacas represent a smaller portion of the total alpaca population, around 10%.[10] They are thought to have been more prevalent in pre-Columbian Peru since they could be kept at a lower altitude where a thicker fleece was not needed for harsh weather conditions.[11][better source needed]
Alpacas are social herd animals that live in family groups, consisting of a territorial alpha male, females, and their young ones. Alpacas warn the herd about intruders by making sharp, noisy inhalations that sound like a high-pitched bray. The herd may attack smaller predators with their front feet and can spit and kick. Their aggression towards members of the canid family (coyotes, foxes, dogs etc.) is exploited when alpacas are used as guard llamas for guarding sheep.[12][13]
Alpacas can sometimes be aggressive, but they can also be very gentle, intelligent, and extremely observant. For the most part, alpacas are very quiet, but male alpacas are more energetic when they get involved in fighting with other alpacas.[14] When they prey, they are cautious but also nervous when they feel any type of threat. They can feel threatened when a person or another alpaca comes up from behind them.[15][better source needed]
Alpacas set their own boundaries of "personal space" within their families and groups.[16] They make a hierarchy in some sense, and each alpaca is aware of the dominant animals in each group.[14] Body language is the key to their communication. It helps to maintain their order. One example of their body communication includes a pose named broadside, where their ears are pulled back and they stand sideways. This pose is used when male alpacas are defending their territory.[2]
When they are young, they tend to follow larger objects and to sit near or under them. An example of this is a baby alpaca with its mother. This can also apply when an alpaca passes by an older alpaca.[16]
Training
Alpacas are generally very trainable and usually respond to reward, most commonly in the form of food. They can usually be petted without getting agitated, especially if one avoids petting the head or neck. Alpacas are usually quite easy to herd, even in large groups. However, during herding, it is recommended for the handler to approach the animals slowly and quietly, as failing to do so can result in danger for both the animals and the handler.[17]
Alpacas and llamas have started showing up in U.S. nursing homes and hospitals as trained, certified therapy animals. The Mayo Clinic says animal-assisted therapy can reduce pain, depression, anxiety, and fatigue. This type of animal therapy is growing in popularity, and there are several organizations throughout the United States that participate.[18]
Spitting
Not all alpacas spit, but all are capable of doing so. "Spit" is somewhat euphemistic; occasionally the projectile contains only air and a little saliva, although alpacas commonly bring up acidic stomach contents (generally a green, grassy mix) and project it onto their chosen targets. Spitting is mostly reserved for other alpacas, but an alpaca will also occasionally spit at a human.
Spitting can result in what is called "sour mouth". Sour mouth is characterized by "a loose-hanging lower lip and a gaping mouth."[19]
Alpacas can spit for several reasons. A female alpaca spits when she is not interested in a male alpaca, typically when she thinks that she is already impregnated. Both sexes of alpaca keep others away from their food, or anything they have their eyes on. Most give a slight warning before spitting by blowing air out and raising their heads, giving their ears a "pinned" appearance.[16]
Alpacas can spit up to ten feet if they need to. For example, if another animal does not back off, the alpaca will throw up its stomach contents, resulting in a lot of spit.[20]
Some signs of stress which can lead to their spitting habits include: humming, a wrinkle under their eye, drooling, rapid breathing, and stomping their feet. When alpacas show any sign of interest or alertness, they tend to sniff their surroundings, watch closely, or stand quietly in place and stare.[20]
When it comes to reproduction, they spit because it is a response triggered by the progesterone levels being increased, which is associated with ovulation.[21]
Hygiene
Alpacas use a communal dung pile,[22] where they do not graze. This behaviour tends to limit the spread of internal parasites. Generally, males have much tidier, and fewer dung piles than females, which tend to stand in a line and all go at once. One female approaches the dung pile and begins to urinate and/or defecate, and the rest of the herd often follows. Alpaca waste is collected and used as garden fertilizer or even natural fertilizer.[2]
Because of their preference for using a dung pile for excreting bodily waste, some alpacas have been successfully house-trained.[23]
Alpacas develop dental hygiene problems which affect their eating and behavior. Warning signs include protracted chewing while eating, or food spilling out of their mouths. Poor body condition and sunken cheeks are also telltales of dental problems.
Alpacas make a variety of sounds:
Humming: When alpacas are born, the mother and baby hum constantly. They also hum as a sign of distress, especially when they are separated from their herd. Alpacas may also hum when curious, happy, worried or cautious.
Snorting: Alpacas snort when another alpaca is invading its space.
Grumbling: Alpacas grumble to warn each other. For example, when one is invading another's personal space, it sounds like gurgling.
Clucking: Similar to a hen's cluck, alpacas cluck when a mother is concerned for her cria. Male alpacas cluck to signal friendly behavior.[2]
Screaming: Their screams are extremely deafening and loud. They will scream when they are not handled correctly or when they are being attacked by a potential enemy.
Screeching: A bird-like cry, presumably intended to terrify the opponent. This sound is typically used by male alpacas when they are in a fight over dominance. When a female screeches, it is more of a growl when she is angry.
Females are induced ovulators;[24] meaning the act of mating and the presence of semen causes them to ovulate. Females usually conceive after just one breeding, but occasionally do have trouble conceiving. Artificial insemination is technically difficult, expensive and not common, but it can be accomplished. Embryo transfer is more widespread.
A male is usually ready to mate for the first time between two and three years of age. It is not advisable to allow a young female to be bred until she is mature and has reached two-thirds of her mature weight. Over-breeding a young female before conception is possibly a common cause of uterine infections. As the age of maturation varies greatly between individuals, it is usually recommended that novice breeders wait until females are 18 months of age or older before initiating breeding.[25]
Alpacas can breed at any time throughout the year but it is more difficult to breed in the winter. Most breed during autumn or late spring. The most popular way to have alpacas mate is pen mating. Pen mating is when they move both the female and the desired male into a pen. Another way is paddock mating where one male alpaca is let loose in the paddock with several female alpacas.
The gestation period is, on average, 11.5 months, and usually results in a single offspring, or cria. Twins are rare, occurring about once per 1000 deliveries.[26] Cria are generally between 15 and 19 pounds, and are standing 30 to 90 minutes after birth.[27] After a female gives birth, she is generally receptive to breeding again after about two weeks. Crias may be weaned through human intervention at about six months old and 60 pounds, but many breeders prefer to allow the female to decide when to wean her offspring; they can be weaned earlier or later depending on their size and emotional maturity.
The average lifespan of an alpaca is between 1520 years, and the longest-lived alpaca on record is 27 years.[28]
Cattle tuberculosis can also infect alpacas: Mycobacterium bovis also causes TB in this species worldwide.[29] KrajewskaWędzina et al., 2020 detect M. bovis in individuals traded from the United Kingdom to Poland.[29] To accomplish this they develop a seroassay which correctly identifies positive subjects which are false negative for a common skin test.[29] KrajewskaWędzina et al. also find that alpacas are unusual in mounting a competent early-infection immune response.[29] Bernitz et al., 2021 believe this to generalise to all camelids.[29]
Alpacas can be found throughout most of South America.[30] They typically live in temperate conditions in the mountains with high altitudes.
They are easy to care for since they are not limited to a specific type of environment. Animals such as flamingos, condors, spectacled bears, mountain lions, coyotes, llamas, and sheep live near alpacas when they are in their natural habitat.
Alpacas are native to Peru, but can be found throughout the globe in captivity.[30] Peru currently has the largest alpaca population, with over half the world's animals.[31] The population declined drastically after the Spanish Conquistadors invaded the Andes mountains in 1532, after which 98% of the animals were destroyed. The Spanish also brought with them diseases that were fatal to alpacas.[32]
European conquest forced the animals to move higher into the mountains,[how?] which remained there permanently. Although alpacas had almost been wiped out completely, they were rediscovered sometime during the 19th century by Europeans. After finding uses for them, the animals became important to societies during the industrial revolution.[33]
Nuzzle and Scratch was a British children's television programme featuring two fictional alpacas that was first broadcast between 2008 and 2011.[34]
Interest in alpacas grew as a result of Depp v. Heard, the 2022 trial in which Johnny Depp sued Amber Heard for defamation in Virginia after Heard wrote an op-ed saying she was a public victim of domestic violence. Depp testified, under oath, that he would not make another Pirates of the Caribbean film for "300 million dollars and a million alpacas".[35][36][37]
Alpacas chew their food which ends up being mixed with their cud and saliva and then they swallow it. Alpacas usually eat 1.5% of their body weight daily for normal growth.[38] They mainly need pasture grass, hay, or silage but some may also need supplemental energy and protein foods and they will also normally try to chew on almost anything (e.g. empty bottle). Most alpaca ranchers rotate their feeding grounds so the grass can regrow and fecal parasites may die before reusing the area. Pasture grass is a great source of protein. When seasons change, the grass loses or gains more protein. For example, in the spring, the pasture grass has about 20% protein while in the summer, it only has 6%.[38] They need more energy supplements in the winter to produce body heat and warmth. They get their fiber from hay or from long stems which provides them with vitamin E. Green grass contains vitamin A and E.
Alpacas can eat natural unfertilized grass; however, ranchers can also supplement grass with low-protein grass hay. To provide selenium and other necessary vitamins, ranchers will feed their domestic alpacas a daily dose of grain to provide additional nutrients that are not fully obtained from their primary diet.[39] Alpacas may obtain the necessary vitamins in their native grazing ranges.
Alpacas, like other camelids, have a three-chambered stomach; combined with chewing cud, this three-chambered system allows maximum extraction of nutrients from low-quality forages. Alpacas are not ruminants, pseudo-ruminants, or modified ruminants, as there are many differences between the anatomy and physiology of a camelid and a ruminant stomach.[40]
Alpacas will chew their food in a figure eight motion, swallow the food, and then pass it into one of the stomach's chambers. The first and second chambers (called C1 and C2) are anaerobic fermentation chambers where the fermentation process begins. The alpaca will further absorb nutrients and water in the first part of the third chamber. The end of the third chamber (called C3) is where the stomach secretes acids to digest food and is the likely place where an alpaca will have ulcers if stressed.
Many plants are poisonous to the alpaca, including the bracken fern, Madagascar ragwort, oleander, and some azaleas. In common with similar livestock, others include acorns, African rue, agave, amaryllis, autumn crocus, bear grass, broom snakeweed, buckwheat, ragweed, buttercups, calla lily, orange tree foliage, carnations, castor beans, and many others.[41]
-178
View File
@@ -1,178 +0,0 @@
"""
llama-4b trainer with support of Stanford Alpaca-like JSON datasets (short for SAD)
Intended to use with https://github.com/johnsmith0031/alpaca_lora_4bit
SAD structure:
[
{
"instruction": "Give null hypothesis",
"input": "6 subjects were given a drug (treatment group) and an additional 6 subjects a placebo (control group).",
"output": "Drug is equivalent of placebo"
},
{
"instruction": "What does RNA stand for?",
"input": "",
"output": "RNA stands for ribonucleic acid."
}
]
"""
# Early load config to replace attn if needed
from arg_parser import get_config
ft_config = get_config()
if ft_config.flash_attention:
from monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
replace_llama_attn_with_flash_attn()
import autograd_4bit
if ft_config.backend.lower() == 'triton':
autograd_4bit.switch_backend_to('triton')
else:
autograd_4bit.switch_backend_to('cuda')
import sys
import peft
import peft.tuners.lora
assert peft.tuners.lora.is_gptq_available()
import torch
import transformers
from autograd_4bit import load_llama_model_4bit_low_ram
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel
# ! Config
import train_data
# * Show loaded parameters
if ft_config.local_rank == 0:
print(f"{ft_config}\n")
if ft_config.gradient_checkpointing:
print('Disable Dropout.')
# Load Basic Model
model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir,
ft_config.llama_q4_model,
device_map=ft_config.device_map,
groupsize=ft_config.groupsize,
is_v1_model=ft_config.v1)
# Config Lora
lora_config = LoraConfig(
r=ft_config.lora_r,
lora_alpha=ft_config.lora_alpha,
target_modules=["q_proj", "v_proj"],
lora_dropout=ft_config.lora_dropout,
bias="none",
task_type="CAUSAL_LM",
)
if ft_config.lora_apply_dir is None:
model = get_peft_model(model, lora_config)
else:
device_map = ft_config.device_map
if ft_config.ddp:
device_map = {'': 0}
else:
if torch.cuda.device_count() > 1:
device_map = "auto"
else:
device_map = {'': 0}
print('Device map for lora:', device_map)
model = PeftModel.from_pretrained(model, ft_config.lora_apply_dir, device_map=device_map, torch_dtype=torch.float32)
print(ft_config.lora_apply_dir, 'loaded')
# Scales to half
print('Fitting 4bit scales and zeros to half')
for n, m in model.named_modules():
if '4bit' in str(type(m)):
if m.groupsize == -1:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
# Set tokenizer
tokenizer.pad_token_id = 0
if not ft_config.skip:
# Load Data
data = None
if ft_config.ds_type == "txt" and not ft_config.skip:
#### LLaMa
data = train_data.TrainTxt(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
elif ft_config.ds_type == "alpaca" and not ft_config.skip:
#### Stanford Alpaca-like Data
data = train_data.TrainSAD(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
elif ft_config.ds_type == "gpt4all" and not ft_config.skip:
#### GPT4All Data
data = train_data.TrainGPT4All(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
else:
raise NotImplementedError("ERROR: Unknown dataset format")
data.prepare_data(thd=ft_config.txt_row_thd, use_eos_token=ft_config.use_eos_token)
####
# Use gradient checkpointing
if ft_config.gradient_checkpointing:
print('Applying gradient checkpointing ...')
from gradient_checkpointing import apply_gradient_checkpointing
apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
# Disable Trainer's DataParallel for multigpu
if not ft_config.ddp and torch.cuda.device_count() > 1:
model.is_parallelizable = True
model.model_parallel = True
training_arguments = transformers.TrainingArguments(
per_device_train_batch_size=ft_config.mbatch_size,
gradient_accumulation_steps=ft_config.gradient_accumulation_steps,
warmup_steps=ft_config.warmup_steps,
optim="adamw_torch",
num_train_epochs=ft_config.epochs,
learning_rate=ft_config.lr,
fp16=True,
logging_steps=ft_config.logging_steps,
evaluation_strategy="no",
save_strategy="steps",
eval_steps=None,
save_steps=ft_config.save_steps,
output_dir=ft_config.lora_out_dir,
save_total_limit=ft_config.save_total_limit,
load_best_model_at_end=False,
ddp_find_unused_parameters=False if ft_config.ddp else None,
)
trainer = transformers.Trainer(
model=model,
train_dataset=data.train_data,
eval_dataset=data.val_data,
args=training_arguments,
data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
# Set Model dict
old_state_dict = model.state_dict
model.state_dict = (
lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
).__get__(model, type(model))
# Set Verbose
if ft_config.verbose:
transformers.logging.set_verbosity_info()
# Run Trainer
if ft_config.resume_checkpoint:
print('Resuming from {} ...'.format(ft_config.resume_checkpoint))
trainer.train(ft_config.resume_checkpoint)
else:
trainer.train()
print('Train completed.')
# Save Model
model.save_pretrained(ft_config.lora_out_dir)
if ft_config.checkpoint:
print("Warning: Merge model + LoRA and save the whole checkpoint not implemented yet.")
print('Model Saved.')
-61
View File
@@ -1,61 +0,0 @@
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from torch.utils.checkpoint import checkpoint
from torch.autograd import Variable
import torch
from torch import nn
import numpy as np
class NewForward:
def __init__(self, layer):
self.layer = layer
self.apply_patch()
def apply_patch(self):
self.layer.old_forward_for_cp = self.layer.forward
self.layer.forward = self.new_forward
def new_forward(self, *args, **kwargs):
def func(*args):
return self.layer.old_forward_for_cp(*args, **kwargs)
output = checkpoint(func, *args)
return output
class VarWrapper:
def __init__(self, model):
self.model = model
self.apply_patch()
print('Var Wrapper Patch Applied')
def apply_patch(self):
self.model.old_forward_for_cp = self.model.forward
self.model.forward = self.new_forward
def new_forward(self, *args, **kwargs):
out = self.model.old_forward_for_cp(*args, **kwargs)
out = Variable(out.data, requires_grad=True)
return out
def apply_gradient_checkpointing(model, checkpoint_ratio=1):
new_forwards = []
modules = []
for n, m in model.named_modules():
if isinstance(m, LlamaDecoderLayer):
modules.append(m)
if checkpoint_ratio < 1 and checkpoint_ratio > 0:
checkpoint_locs = np.array((np.linspace(0, 1, int(len(modules) * checkpoint_ratio)) * (len(modules)-1)).round(), dtype=int)
else:
checkpoint_locs = np.arange(len(modules))
for i in checkpoint_locs:
m = modules[i]
new_forwards.append(NewForward(m))
print('Forward Patch Applied For Block {}'.format(i))
for n, m in model.named_modules():
if isinstance(m, torch.nn.Embedding):
wrapper = VarWrapper(m)
break
return new_forwards, wrapper
-44
View File
@@ -1,44 +0,0 @@
import os
import sys
import time
import torch
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
config_path = './llama-13b-4bit/'
model_path = './llama-13b-4bit.pt'
model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1)
print('Fitting 4bit scales and zeros to half')
model.half()
for n, m in model.named_modules():
if isinstance(m, Autograd4bitQuantLinear):
if m.groupsize == -1:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
print('Apply AMP Wrapper ...')
from amp_wrapper import AMPWrapper
wrapper = AMPWrapper(model)
wrapper.apply_generate()
prompt = '''I think the meaning of life is'''
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
batch = {k: v.cuda() for k, v in batch.items()}
start = time.time()
with torch.no_grad():
generated = model.generate(inputs=batch["input_ids"],
do_sample=True, use_cache=True,
repetition_penalty=1.1,
max_new_tokens=20,
temperature=0.9,
top_p=0.95,
top_k=40,
return_dict_in_generate=True,
output_attentions=False,
output_hidden_states=False,
output_scores=False)
result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
end = time.time()
print(result_text)
print(end - start)
-143
View File
@@ -1,143 +0,0 @@
import torch
import numpy as np
from gptq_llama import quant_cuda
# Global Buffer
buffer_mat_dic = {}
use_new = True
auto_switch = True
auto_switch_thd = 8
debug = False
def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
if shape_of_qweight not in buffer_mat_dic.keys():
buffer_mat_dic[shape_of_qweight] = torch.zeros((shape_of_qweight[0] * 8, shape_of_qweight[1]), dtype=dtype, device=device)
else:
if buffer_mat_dic[shape_of_qweight].device != device:
buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(device)
if buffer_mat_dic[shape_of_qweight].dtype != dtype:
buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(dtype=dtype)
return buffer_mat_dic[shape_of_qweight]
def _matmul4bit_v1(x, qweight, scales, zeros):
"""
input x: (n, m)
qweight: (j, k)
where m == j*8
perform x @ qweight
return y:
"""
if debug:
print('_matmul4bit_v1')
assert qweight.shape[0] * 8 == x.shape[-1]
outshape = x.shape[:-1] + (qweight.shape[1],)
x = x.reshape(-1, x.shape[-1])
y = torch.zeros((x.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x.device)
dtype = x.dtype
x = x.half()
quant_cuda.vecquant4matmul_v1_faster(x, qweight, y, scales, zeros)
y = y.to(dtype)
return y.reshape(outshape)
def _matmul4bit_v2(x, qweight, scales, zeros, g_idx):
"""
input x: (n, m)
qweight: (j, k)
where m == j*8
perform x @ qweight
return y:
"""
if debug:
print('_matmul4bit_v2')
assert qweight.shape[0] * 8 == x.shape[-1]
outshape = x.shape[:-1] + (qweight.shape[1],)
x = x.reshape(-1, x.shape[-1])
y = torch.zeros((x.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x.device)
dtype = x.dtype
x = x.half()
quant_cuda.vecquant4matmul_faster(x, qweight, y, scales, zeros, g_idx, x.shape[-1] // 2)
y = y.to(dtype)
return y.reshape(outshape)
def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False):
if debug:
print('_matmul4bit_v1_recons')
if not transpose:
assert qweight.shape[0] * 8 == x.shape[-1]
else:
assert qweight.shape[1] == x.shape[-1]
buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
quant_cuda.vecquant4recons_v1(qweight, buffer, scales, zeros)
if not transpose:
output = torch.matmul(x, buffer)
else:
output = torch.matmul(x, buffer.T)
return output
def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
if debug:
print('_matmul4bit_v2_recons')
if not transpose:
assert qweight.shape[0] * 8 == x.shape[-1]
else:
assert qweight.shape[1] == x.shape[-1]
buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
quant_cuda.vecquant4recons_v2(qweight, buffer, scales, zeros, g_idx)
if not transpose:
output = torch.matmul(x, buffer)
else:
output = torch.matmul(x, buffer.T)
return output
def matmul4bit(x, qweight, scales, zeros, g_idx=None):
# detect if zeros is int32
if zeros.dtype != torch.int32:
# use v1
if use_new:
if auto_switch:
if np.prod(x.shape[:-1]) > auto_switch_thd:
output = _matmul4bit_v1_recons(x.to(scales.dtype), qweight, scales, zeros)
else:
output = _matmul4bit_v1(x, qweight, scales.float(), zeros.float())
else:
output = _matmul4bit_v1(x, qweight, scales.float(), zeros.float())
else:
if g_idx is None:
g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device)
# use v2
if use_new:
if auto_switch:
if np.prod(x.shape[:-1]) > auto_switch_thd:
output = _matmul4bit_v2_recons(x.to(scales.dtype), qweight, scales, zeros, g_idx)
else:
output = _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
else:
output = _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
return output
def v2_to_v1(scales, zeros):
"""
Convert zeros in V2 model to V1 model when group_num = 1, for debugging
depreciated
"""
assert zeros.shape[0] == 1
z_mat = torch.zeros((zeros.shape[1], 256), dtype=torch.int, device=zeros.device) + zeros.reshape((-1,1))
z_buffer = torch.zeros((z_mat.shape[0] * 8, z_mat.shape[1]), dtype=torch.float16, device=zeros.device)
z_zeros = torch.zeros(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
z_scales = torch.ones(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
quant_cuda.vecquant4recons_v1(z_mat, z_buffer, z_scales, z_zeros)
z_buffer = z_buffer[:,0]
zeros_recons = z_buffer * scales + scales
return zeros_recons
+57
View File
@@ -0,0 +1,57 @@
My personal repo to convert models from Lora to huggingface/ggml/gptq 4bit so I can run them in normal text-webui and llama.cpp
How do we do this?
1. lora -> hf
- [tloen/alpaca-lora/export_hf_checkpoint.py](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py)
2. hf -> 4bit
- using [GPTQ-for-LLaMa/llama.py](https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/llama.py)
`CUDA_VISIBLE_DEVICES=0 python llama.py ./llama-hf/llama-7b c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save llama7b-4bit-128g.pt`
3) and to ggml
- [llama.cpp/convert-pth-to-ggml.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-pth-to-ggml.py)
# TODO
- [ ] lora -> hf
- [ ] hf -> 4bit
- [ ] hf -> ggml
# setup env
```sh
conda create -n textgen3 python=3.10.9
conda activate textgen3
mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7 cudatoolkit=11.7 -c pytorch -c nvidia -c conda-forge
```
# download models
```sh
# # base models.... FIXME
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ../llama-30b-4bit.pt
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama13b-4bit.pt ../llama-13b-4bit.pt
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama7b-4bit.pt ../llama-7b-4bit.pt
# cools models:
# - https://huggingface.co/jordiclive/gpt4all-alpaca-oa-codealpaca-lora-13b
# - https://huggingface.co/Black-Engineer/oasst-llama30b-ggml-q4
# - https://huggingface.co/chansung/alpaca-lora-30b
# download loras
python scripts/download-model.py chansung/alpaca-lora-30b
python scripts/download-model.py chansung/alpaca-lora-13b
python scripts/download-model.py tloen/alpaca-lora-7b
```
# convert models
```sh
python scripts/export_hf_checkpoint.py ./models/llama-7b-hf -l loras/tloen_alpaca-lora-7b
```
# Links
- https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md
@@ -1,144 +0,0 @@
from typing import List, Optional, Tuple
import torch
from torch import nn
import transformers
from transformers.models.llama.modeling_llama import LlamaConfig, LlamaRotaryEmbedding, apply_rotary_pos_emb
from einops import rearrange
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
from flash_attn.bert_padding import unpad_input, pad_input
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
config: LlamaConfig,
):
super().__init__()
hidden_size = config.hidden_size
num_heads = config.num_attention_heads
self.hidden_size = hidden_size
self.num_heads = num_heads
self.head_dim = self.hidden_size // num_heads
if (self.head_dim * num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {num_heads}).")
self.q_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.k_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.v_proj = nn.Linear(
hidden_size,
num_heads * self.head_dim,
bias=False,
)
self.o_proj = nn.Linear(
num_heads * self.head_dim,
hidden_size,
bias=False,
)
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads,
self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor],
Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel
attention_mask: [bsz, q_len]
"""
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = self.k_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
value_states = self.v_proj(hidden_states).view(
bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
# [bsz, q_len, nh, hd]
# [bsz, nh, q_len, hd]
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states,
key_states,
cos,
sin,
position_ids)
# [bsz, nh, t, hd]
assert not output_attentions, "output_attentions is not supported"
assert not use_cache, "use_cache is not supported"
assert past_key_value is None, "past_key_value is not supported"
# Flash attention codes from
# https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
# transform the data into the format required by flash attention
qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd]
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
# We have disabled _prepare_decoder_attention_mask in LlamaModel
# the attention_mask should be the same as the key_padding_mask
key_padding_mask = attention_mask
if key_padding_mask is None:
qkv = rearrange(qkv, 'b s ... -> (b s) ...')
max_s = q_len
cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32,
device=qkv.device)
output = flash_attn_unpadded_qkvpacked_func(
qkv, cu_q_lens, max_s, 0.0,
softmax_scale=None, causal=True
)
output = rearrange(output, '(b s) ... -> b s ...', b=bsz)
else:
nheads = qkv.shape[-2]
x = rearrange(qkv, 'b s three h d -> b s (three h d)')
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
output_unpad = flash_attn_unpadded_qkvpacked_func(
x_unpad, cu_q_lens, max_s, 0.0,
softmax_scale=None, causal=True
)
output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
indices, bsz, q_len),
'b s (h d) -> b s h d', h=nheads)
return self.o_proj(rearrange(output,
'b s h d -> b s (h d)')), None, None
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
inputs_embeds, past_key_values_length):
# [bsz, seq_len]
return attention_mask
def replace_llama_attn_with_flash_attn():
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
transformers.models.llama.modeling_llama.LlamaAttention = LlamaAttention
+5 -4
View File
@@ -1,12 +1,13 @@
torch
# torch
accelerate
bitsandbytes
datasets
sentencepiece
safetensors
flash-attn
# flash-attn
triton
colorama
git+https://github.com/huggingface/transformers.git
git+https://github.com/huggingface/transformers.git@656e869
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
git+https://github.com/sterlind/peft.git
# git+https://github.com/sterlind/peft.git@085c09d
git+https://github.com/wassname/peft.git
+37
View File
@@ -0,0 +1,37 @@
'''
clones models from Hugging Face to models/model-name.
Example:
python clone-model.py facebook/opt-1.3b
'''
from git import Repo
import argparse
from tqdm.auto import tqdm
from git import RemoteProgress
parser = argparse.ArgumentParser()
parser.add_argument('MODEL', type=str, default=None, help="`tloen/alpaca-lora-7b`")
parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
args = parser.parse_args()
class CloneProgress(RemoteProgress):
"""tqdm progress bar for GitPython"""
def __init__(self):
super().__init__()
self.pbar = tqdm()
def update(self, op_code, cur_count, max_count=None, message=''):
self.pbar.total = max_count
self.pbar.n = cur_count
self.pbar.refresh()
if __name__ == '__main__':
model = args.MODEL
repo = 'https://huggingface.co/' + model
name = model.replace('/', '_')
dest = f'./models/{name}'
print(f'cloning "{repo}" to "{dest}"')
Repo.clone_from(repo, dest, progress=CloneProgress())
+275
View File
@@ -0,0 +1,275 @@
'''
Downloads models from Hugging Face to models/model-name.
Example:
python download-model.py facebook/opt-1.3b
From https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py
'''
import argparse
import base64
import datetime
import hashlib
import json
import re
import sys
from pathlib import Path
import requests
import tqdm
from tqdm.contrib.concurrent import thread_map
parser = argparse.ArgumentParser()
parser.add_argument('MODEL', type=str, default=None, nargs='?')
parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
args = parser.parse_args()
def select_model_from_default_options():
models = {
"OPT 6.7B": ("facebook", "opt-6.7b", "main"),
"OPT 2.7B": ("facebook", "opt-2.7b", "main"),
"OPT 1.3B": ("facebook", "opt-1.3b", "main"),
"OPT 350M": ("facebook", "opt-350m", "main"),
"GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"),
"GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"),
"GALACTICA 125M": ("facebook", "galactica-125m", "main"),
"Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"),
"Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"),
"Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"),
"Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"),
}
choices = {}
print("Select the model that you want to download:\n")
for i, name in enumerate(models):
char = chr(ord('A') + i)
choices[char] = name
print(f"{char}) {name}")
char = chr(ord('A') + len(models))
print(f"{char}) None of the above")
print()
print("Input> ", end='')
choice = input()[0].strip().upper()
if choice == char:
print("""\nThen type the name of your desired Hugging Face model in the format organization/name.
Examples:
facebook/opt-1.3b
EleutherAI/pythia-1.4b-deduped
""")
print("Input> ", end='')
model = input()
branch = "main"
else:
arr = models[choices[choice]]
model = f"{arr[0]}/{arr[1]}"
branch = arr[2]
return model, branch
def sanitize_model_and_branch_names(model, branch):
if model[-1] == '/':
model = model[:-1]
if branch is None:
branch = "main"
else:
pattern = re.compile(r"^[a-zA-Z0-9._-]+$")
if not pattern.match(branch):
raise ValueError("Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.")
return model, branch
def get_download_links_from_huggingface(model, branch, text_only=False):
base = "https://huggingface.co"
page = f"/api/models/{model}/tree/{branch}?cursor="
cursor = b""
links = []
sha256 = []
classifications = []
has_pytorch = False
has_pt = False
has_ggml = False
has_safetensors = False
is_lora = False
while True:
content = requests.get(f"{base}{page}{cursor.decode()}").content
dict = json.loads(content)
if len(dict) == 0:
break
for i in range(len(dict)):
if 'error' in dict:
print("you might need to run `huggingface-cli login`")
raise Exception(dict['error'])
fname = dict[i]['path']
if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
is_lora = True
is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
is_safetensors = re.match(".*\.safetensors", fname)
is_pt = re.match(".*\.pt", fname)
is_ggml = re.match("ggml.*\.bin", fname)
is_tokenizer = re.match("tokenizer.*\.model", fname)
is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer
if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)):
if 'lfs' in dict[i]:
sha256.append([fname, dict[i]['lfs']['oid']])
if is_text:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
classifications.append('text')
continue
if not text_only:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
if is_safetensors:
has_safetensors = True
classifications.append('safetensors')
elif is_pytorch:
has_pytorch = True
classifications.append('pytorch')
elif is_pt:
has_pt = True
classifications.append('pt')
elif is_ggml:
has_ggml = True
classifications.append('ggml')
cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
cursor = base64.b64encode(cursor)
cursor = cursor.replace(b'=', b'%3D')
# If both pytorch and safetensors are available, download safetensors only
if (has_pytorch or has_pt) and has_safetensors:
for i in range(len(classifications) - 1, -1, -1):
if classifications[i] in ['pytorch', 'pt']:
links.pop(i)
return links, sha256, is_lora
def get_output_folder(model, branch, is_lora, base_folder=None):
if base_folder is None:
base_folder = 'models' if not is_lora else 'loras'
output_folder = f"{'_'.join(model.split('/')[-2:])}"
if branch != 'main':
output_folder += f'_{branch}'
output_folder = Path(base_folder) / output_folder
return output_folder
def get_single_file(url, output_folder, start_from_scratch=False):
filename = Path(url.rsplit('/', 1)[1])
output_path = output_folder / filename
if output_path.exists() and not start_from_scratch:
# Check if the file has already been downloaded completely
r = requests.get(url, stream=True)
total_size = int(r.headers.get('content-length', 0))
if output_path.stat().st_size >= total_size:
return
# Otherwise, resume the download from where it left off
headers = {'Range': f'bytes={output_path.stat().st_size}-'}
mode = 'ab'
else:
headers = {}
mode = 'wb'
r = requests.get(url, stream=True, headers=headers)
with open(output_path, mode) as f:
total_size = int(r.headers.get('content-length', 0))
block_size = 1024
with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
for data in r.iter_content(block_size):
t.update(len(data))
f.write(data)
def start_download_threads(file_list, output_folder, start_from_scratch=False, threads=1):
thread_map(lambda url: get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True)
def download_model_files(model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1):
# Creating the folder and writing the metadata
if not output_folder.exists():
output_folder.mkdir()
with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
f.write(f'url: https://huggingface.co/{model}\n')
f.write(f'branch: {branch}\n')
f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
sha256_str = ''
for i in range(len(sha256)):
sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n'
if sha256_str != '':
f.write(f'sha256sum:\n{sha256_str}')
# Downloading the files
print(f"Downloading the model to {output_folder}")
start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)
def check_model_files(model, branch, links, sha256, output_folder):
# Validate the checksums
validated = True
for i in range(len(sha256)):
fpath = (output_folder / sha256[i][0])
if not fpath.exists():
print(f"The following file is missing: {fpath}")
validated = False
continue
with open(output_folder / sha256[i][0], "rb") as f:
bytes = f.read()
file_hash = hashlib.sha256(bytes).hexdigest()
if file_hash != sha256[i][1]:
print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}')
validated = False
else:
print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}')
if validated:
print('[+] Validated checksums of all model files!')
else:
print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
if __name__ == '__main__':
branch = args.branch
model = args.MODEL
if model is None:
model, branch = select_model_from_default_options()
# Cleaning up the model/branch names
try:
model, branch = sanitize_model_and_branch_names(model, branch)
except ValueError as err_branch:
print(f"Error: {err_branch}")
sys.exit()
# Getting the download links from Hugging Face
links, sha256, is_lora = get_download_links_from_huggingface(model, branch, text_only=args.text_only)
# Getting the output folder
output_folder = get_output_folder(model, branch, is_lora, base_folder=args.output)
if args.check:
# Check previously downloaded files
check_model_files(model, branch, links, sha256, output_folder)
else:
# Download files
download_model_files(model, branch, links, sha256, output_folder, threads=args.threads)
+81
View File
@@ -0,0 +1,81 @@
"""
From https://raw.githubusercontent.com/tloen/alpaca-lora/main/export_hf_checkpoint.py
"""
import os
from pathlib import Path
import argparse
import torch
import transformers
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: F402
def main(BASE_MODEL, LORA_MODEL, output_path=None):
if output_path is None:
output_path = 'models/' + LORA_MODEL.split('/')[-1] + '-delorified'
# BASE_MODEL = os.environ.get("BASE_MODEL", None)
# assert (
# BASE_MODEL
# ), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=huggyllama/llama-7b`" # noqa: E501
# LORA_MODEL = os.environ.get("BASE_MODEL", None)
# assert (
# LORA_MODEL
# ), "Please specify a value for LORA_MODEL environment variable, e.g. `export BASE_MODEL=tloen/alpaca-lora-7b`" # noqa: E501
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
base_model = LlamaForCausalLM.from_pretrained(
BASE_MODEL,
load_in_8bit=False,
torch_dtype=torch.float16,
device_map={"": "cpu"},
)
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()
lora_model = PeftModel.from_pretrained(
base_model,
LORA_MODEL,
device_map={"": "cpu"},
torch_dtype=torch.float16,
)
lora_weight = lora_model.base_model.model.model.layers[
0
].self_attn.q_proj.weight
assert torch.allclose(first_weight_old, first_weight)
# merge weights - new merging method from peft
lora_model = lora_model.merge_and_unload()
lora_model.train(False)
# did we do anything?
assert not torch.allclose(first_weight_old, first_weight)
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
k.replace("base_model.model.", ""): v
for k, v in lora_model_sd.items()
if "lora" not in k
}
LlamaForCausalLM.save_pretrained(
base_model, output_path, state_dict=deloreanized_sd, max_shard_size="400MB"
)
print(f'output {output_path}')
if __name__=="__main__":
parser = argparse.ArgumentParser()
parser.add_argument('model', type=str)
parser.add_argument('-l', '--lora', type=str, default='main', help='Lora repo or path e.g. `tloen/alpaca-lora-7b`')
parser.add_argument('-o', '--output', type=Path, default=None)
"e.g. ./hf_ckpt. default will be lora name"
args = parser.parse_args()
main(args.model, args.lora, args.output)
-273
View File
@@ -1,273 +0,0 @@
import torch
from abc import ABC, abstractmethod
from typing import Dict, Any
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import DefaultDataCollator
import os
# Abstract train data loader
class ATrainData(ABC):
"""
"""
@abstractmethod
def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len: int) -> None:
"""
Args:
dataset (str): Path to dataset
val_set_size (int) : Size of validation set
tokenizer (_type_): Tokenizer
"""
self.tokenizer = tokenizer
self.dataset = dataset
self.val_set_size = val_set_size
self.cutoff_len = cutoff_len
self.train_data = None
self.val_data = None
@abstractmethod
def tokenize(self, prompt: str) -> Dict[str, Any]:
"""Tokenization method
Args:
prompt (str): Prompt string from dataset
Returns:
Dict[str, Any]: token
"""
pass
@abstractmethod
def prepare_data(self) -> None:
"""Loads dataset from file and prepares train_data property for trainer
"""
pass
# LLaMA txt train data loader
class TrainTxt(ATrainData):
def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len):
super().__init__(dataset, val_set_size, tokenizer, cutoff_len) # TODO: Validation size isn't used
self.cutoff_len = cutoff_len
self.exceed_count = 0
def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
if use_eos_token:
result = self.tokenizer(
prompt + self.tokenizer.eos_token,
truncation=True,
max_length=self.cutoff_len,
padding=False,
)
d = {
"input_ids": result["input_ids"],
"attention_mask": result["attention_mask"],
}
if (
d["input_ids"][-1] != self.tokenizer.eos_token_id
and len(d["input_ids"]) < self.cutoff_len
):
d["input_ids"].append(self.tokenizer.eos_token_id)
d["attention_mask"].append(1)
else:
result = self.tokenizer(
prompt,
truncation=True,
max_length=self.cutoff_len + 1,
padding="max_length",
)
d = {
"input_ids": result["input_ids"][:-1],
"attention_mask": result["attention_mask"][:-1],
}
if sum(d['attention_mask']) >= self.cutoff_len:
self.exceed_count += 1
return d
@classmethod
def format_new_rows(cls, rows, thd=128):
r_b = ''
new_rows = []
for row in rows:
if len(r_b) == 0:
r_b += row
else:
r_b += '\n' + row
if len(r_b) > thd:
new_rows.append(r_b)
r_b = ''
if len(r_b) > thd:
new_rows.append(r_b)
r_b = ''
return new_rows
def prepare_data(self, thd=-1, use_eos_token=True, **kwargs):
if os.path.isdir(self.dataset):
rows = []
for filename in os.listdir(self.dataset):
with open(self.dataset + filename, 'r', encoding='utf8') as file:
txt = file.read()
txt = txt.replace('\r\n', '\n').replace('\u3000', ' ')
rows += [r for r in txt.split('\n') if r != '']
else:
with open(self.dataset, 'r', encoding='utf8') as file:
txt = file.read()
txt = txt.replace('\r\n', '\n')
rows = [r for r in txt.split('\n') if r != '']
if thd != -1:
rows = self.format_new_rows(rows, thd=thd)
data = Dataset.from_dict({"input": rows})
data = data.shuffle().map(lambda x: self.tokenize(x["input"], use_eos_token=use_eos_token))
print('Train Data: {:.2f}%'.format(self.exceed_count / len(data) * 100), 'outliers')
self.train_data = data
# Stanford Alpaca-like Data
class TrainSAD(ATrainData):
def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
# there's probably a way to do this with the tokenizer settings
# but again, gotta move fast
if use_eos_token:
result = self.tokenizer(
prompt + self.tokenizer.eos_token,
truncation=True,
max_length=self.cutoff_len,
padding=False,
)
if (
result["input_ids"][-1] != self.tokenizer.eos_token_id
and len(result["input_ids"]) < self.cutoff_len
):
result["input_ids"].append(self.tokenizer.eos_token_id)
result["attention_mask"].append(1)
return result
else:
result = self.tokenizer(
prompt,
truncation=True,
max_length=self.cutoff_len + 1,
padding="max_length",
)
return {
"input_ids": result["input_ids"][:-1],
"attention_mask": result["attention_mask"][:-1],
}
def prepare_data(self, use_eos_token=True, **kwargs) -> None:
data = load_dataset("json", data_files=self.dataset)
if self.val_set_size > 0:
train_val = data["train"].train_test_split(
test_size=self.val_set_size, shuffle=True, seed=42 # ! Seed = 42 (?)
)
self.train_data = train_val["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
self.val_data = train_val["test"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
else:
self.train_data = data["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
self.val_data = None
# Auxiliary methods
def generate_prompt(self, data_point, **kwargs):
return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format(
"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
"### Instruction:",
data_point["instruction"],
"### Input:",
data_point["input"],
"### Response:",
data_point["output"]
)
def generate_and_tokenize_prompt(self, data_point, **kwargs):
prompt = self.generate_prompt(data_point, **kwargs)
return self.tokenize(prompt, **kwargs)
# GPT4All-like Data
class TrainGPT4All(ATrainData):
def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
pass
def tokenize_inputs(self, examples):
max_length = self.cutoff_len
input_ids = torch.full((len(examples["prompt"]), max_length), self.tokenizer.pad_token_id)
# ignore bos
newline_tokens = self.tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:]
out = {"labels": [], "attention_mask": []}
for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])):
input_tokens = self.tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze()
if input_tokens.dim() == 0:
input_tokens = input_tokens.unsqueeze(0)
input_len = len(input_tokens)
# plus one since we remove bos from response
# but we subtract one since we want to add eos token
remaining_tokens = max_length - input_len - len(newline_tokens) + 1
# remove bos
target_tokens = self.tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:]
input_ids[i, :input_len] = input_tokens
# add newline between prompt and response
newline_plus_inputs = input_len + len(newline_tokens)
input_ids[i, input_len: newline_plus_inputs] = newline_tokens
# add target tokens, remove bos
input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
# add eos token, enforce stopping if we don't truncate
# we don't want long code to stop generating if truncated during training
if newline_plus_inputs + len(target_tokens) < max_length:
input_ids[i, newline_plus_inputs + len(target_tokens)] = self.tokenizer.eos_token_id
labels = input_ids[i].clone()
labels[: newline_plus_inputs] = -100
labels[labels == self.tokenizer.pad_token_id] = -100
# to debug this, can set all values == -100 to the pad token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response
attention_mask = input_ids[i].ne(self.tokenizer.pad_token_id).int()
out["labels"].append(labels)
out["attention_mask"].append(attention_mask)
out["input_ids"] = input_ids
out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()}
return out
def prepare_data(self, **kwargs) -> None:
dataset = load_dataset("json", data_files=self.dataset)
self.val_data = None
if self.val_set_size > 0:
dataset = dataset["train"].train_test_split(
test_size=self.val_set_size, shuffle=True, seed=42 # ! Seed = 42 (?)
)
train_dataset, val_dataset = dataset["train"], dataset["test"]
# tokenize inputs and return labels and attention mask
val_dataset = val_dataset.map(
lambda ele: self.tokenize_inputs(ele),
batched=True,
remove_columns=["source", "prompt"],
)
self.val_data = val_dataset.with_format("torch")
else:
train_dataset = dataset["train"]
train_dataset = train_dataset.map(
lambda ele: self.tokenize_inputs(ele),
batched=True,
remove_columns=["source", "prompt"],
)
self.train_data = train_dataset.with_format("torch")
-246
View File
@@ -1,246 +0,0 @@
import triton
import triton.language as tl
import torch
import custom_autotune
# code based https://github.com/fpgaminer/GPTQ-triton
@custom_autotune.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
# These provided a benefit on a 3090
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
],
key=['M', 'N'],
nearest_power_of_two=True,
)
@triton.jit
def matmul_248_kernel(a_ptr, b_ptr, c_ptr,
scales_ptr, zeros_ptr, g_ptr,
M, N, K, bits, maxq,
stride_am, stride_ak,
stride_bk, stride_bn,
stride_cm, stride_cn,
stride_scales, stride_zeros,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr):
"""
Compute the matrix multiplication C = A x B.
A is of shape (M, K) float16
B is of shape (K//8, N) int32
C is of shape (M, N) float16
scales is of shape (G, N) float16
zeros is of shape (G, N) float16
g_ptr is of shape (K) int32
"""
infearure_per_bits = 32 // bits
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
a_mask = (offs_am[:, None] < M)
# b_ptrs is set up such that it repeats elements along the K axis 8 times
b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
g_ptrs = g_ptr + offs_k
# shifter is used to extract the N bits of each element in the 32-bit word from B
scales_ptrs = scales_ptr + offs_bn[None, :]
zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
shifter = (offs_k % infearure_per_bits) * bits
zeros_shifter = (offs_bn % infearure_per_bits) * bits
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, num_pid_k):
g_idx = tl.load(g_ptrs)
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = (zeros >> zeros_shifter[None, :]) & maxq
zeros = (zeros + 1)
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
# Now we need to unpack b (which is N-bit values) into 32-bit values
b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
b = (b - zeros) * scales # Scale and shift
# ! Convert to fp16
b = b.to(tl.float16)
a = a.to(tl.float16)
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_K
b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
g_ptrs += BLOCK_SIZE_K
c = accumulator.to(tl.float16)
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
# code based https://github.com/fpgaminer/GPTQ-triton
@custom_autotune.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 256, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
# These provided a benefit on a 3090
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
],
key=['M', 'K'],
nearest_power_of_two=True,
)
@triton.jit
def trans_matmul_248_kernel(a_ptr, b_ptr, c_ptr,
scales_ptr, zeros_ptr, g_ptr,
M, N, K, bits, maxq,
stride_am, stride_ak,
stride_bk, stride_bn,
stride_cm, stride_cn,
stride_scales, stride_zeros,
BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
GROUP_SIZE_M: tl.constexpr):
"""
Compute the matrix multiplication C = A x B.
A is of shape (M, N) float16
B is of shape (K//8, N) int32
C is of shape (M, K) float16
scales is of shape (G, N) float16
zeros is of shape (G, N) float16
g_ptr is of shape (K) int32
"""
infearure_per_bits = 32 // bits
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_k
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_k = (pid % num_pid_in_group) // group_size_m
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
offs_n = tl.arange(0, BLOCK_SIZE_N)
a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N)
a_mask = (offs_am[:, None] < M)
# b_ptrs is set up such that it repeats elements along the K axis 8 times
b_ptrs = b_ptr + ((offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N)
g_ptrs = g_ptr + offs_bk
g_idx = tl.load(g_ptrs)
# shifter is used to extract the N bits of each element in the 32-bit word from B
scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales
zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros
shifter = (offs_bk % infearure_per_bits) * bits
zeros_shifter = (offs_n % infearure_per_bits) * bits
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)
for k in range(0, num_pid_n):
# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
scales = tl.load(scales_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = tl.load(zeros_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = (zeros >> zeros_shifter[None, :]) & maxq
zeros = (zeros + 1)
a = tl.load(a_ptrs, mask=a_mask, other=0.) # (BLOCK_SIZE_M, BLOCK_SIZE_N)
b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
# Now we need to unpack b (which is N-bit values) into 32-bit values
b = (b >> shifter[:, None]) & maxq # Extract the N-bit values
b = (b - zeros) * scales # Scale and shift
b = tl.trans(b)
# ! Convert to fp16
b = b.to(tl.float16)
a = a.to(tl.float16)
accumulator += tl.dot(a, b)
a_ptrs += BLOCK_SIZE_N
b_ptrs += BLOCK_SIZE_N
scales_ptrs += BLOCK_SIZE_N
zeros_ptrs += (BLOCK_SIZE_N // infearure_per_bits)
c = accumulator.to(tl.float16)
c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :]
c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K)
tl.store(c_ptrs, c, mask=c_mask)
def triton_matmul(input, qweight, scales, qzeros, g_idx, bits, maxq):
assert input.shape[-1] == qweight.shape[0] * 32 // bits
outshape = input.shape[:-1] + (qweight.shape[1],)
input = input.reshape(-1, input.shape[-1])
output = torch.empty((input.shape[0], qweight.shape[1]), device=scales.device, dtype=torch.float16)
grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),)
matmul_248_kernel[grid](input, qweight, output,
scales, qzeros, g_idx,
input.shape[0], qweight.shape[1], input.shape[1], bits, maxq,
input.stride(0), input.stride(1),
qweight.stride(0), qweight.stride(1),
output.stride(0), output.stride(1),
scales.stride(0), qzeros.stride(0))
output = output.reshape(outshape)
return output
def triton_matmul_transpose(input, qweight, scales, qzeros, g_idx, bits, maxq):
assert input.shape[-1] == qweight.shape[1]
out_dim = qweight.shape[0] * 32 // bits
outshape = input.shape[:-1] + (out_dim,)
input = input.reshape(-1, input.shape[-1])
output_shape_mid = (input.shape[0], out_dim)
output = torch.empty((output_shape_mid[0], output_shape_mid[1]), device=scales.device, dtype=torch.float16)
grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_shape_mid[1], META['BLOCK_SIZE_K']),)
trans_matmul_248_kernel[grid](input, qweight, output,
scales, qzeros, g_idx,
input.shape[0], qweight.shape[1], output_shape_mid[1], bits, maxq,
input.stride(0), input.stride(1),
qweight.stride(0), qweight.stride(1),
output.stride(0), output.stride(1),
scales.stride(0), qzeros.stride(0))
output = output.reshape(outshape)
return output