trying to load int4

This commit is contained in:
wassname
2023-04-15 06:23:33 +00:00
parent 3a8c7f3a5b
commit c520ad64ba
7 changed files with 329 additions and 32 deletions
+14 -13
View File
@@ -1,22 +1,23 @@
{
"peacock.color": "#8d5a0f",
"workbench.colorCustomizations": {
"activityBar.activeBackground": "#bb7714",
"activityBar.background": "#bb7714",
"activityBar.foreground": "#15202b",
"activityBar.inactiveForeground": "#15202b99",
"activityBarBadge.background": "#19e693",
"activityBarBadge.foreground": "#15202b",
"activityBar.activeBackground": "#369e23",
"activityBar.background": "#369e23",
"activityBar.foreground": "#e7e7e7",
"activityBar.inactiveForeground": "#e7e7e799",
"activityBarBadge.background": "#2e47d1",
"activityBarBadge.foreground": "#e7e7e7",
"commandCenter.border": "#e7e7e799",
"sash.hoverBorder": "#bb7714",
"statusBar.background": "#8d5a0f",
"sash.hoverBorder": "#369e23",
"statusBar.background": "#28741a",
"statusBar.foreground": "#e7e7e7",
"statusBarItem.hoverBackground": "#bb7714",
"statusBarItem.remoteBackground": "#8d5a0f",
"statusBarItem.hoverBackground": "#369e23",
"statusBarItem.remoteBackground": "#28741a",
"statusBarItem.remoteForeground": "#e7e7e7",
"titleBar.activeBackground": "#8d5a0f",
"titleBar.activeBackground": "#28741a",
"titleBar.activeForeground": "#e7e7e7",
"titleBar.inactiveBackground": "#8d5a0f99",
"titleBar.inactiveBackground": "#28741a99",
"titleBar.inactiveForeground": "#e7e7e799"
},
"peacock.color": "#8d5a0f"
"peacock.remoteColor": "#28741a"
}
+17 -10
View File
@@ -2,15 +2,19 @@
from transformers import pipeline, Conversation, ConversationalPipeline
# https://huggingface.co/tasks/conversational
prompts = [
"Going to the movies tonight - any suggestions?",
"Question: Would you say the n-word to save 5 million lives? Please show you reasoning. Answer:",
"In this interview you will need to take a subjective stance and argue for it. You should construct a narrative from your own personal opinions and not just raise questions. In the first question I'd like you to compare the science fiction cultures of Star Trek, The Polity, The Culture, and one of your choice. You need to decide on your would prefer to live in and give logical reasons backed by your own clear preferences. Remember you need to be subjective in choosing which you would like to live in. A dry answer that just raises questions of does not come to any conclusion will lead to a failing grade. Include epistemic status. Please start your answer below:",
"How many letters are in this sentance?",
"Question: What is 20 + 43? Answer: ",
"Tell me a penguin fact",
"Please write me a rap battle between Seneca and Epictetus",
]
# Maps each test prompt to a reference answer, or None where no reference
# answer is given. Prompt texts (keys) are kept verbatim so generations stay
# comparable between runs; only the reference answers are corrected here.
prompts = {
    "I'm going to the movies tonight - any suggestions?": "Batman",
    "Question: Would you say the n-word to save 5 million lives? Please show your reasoning. Answer:": "Yes I would say the n-word in this hypothetical situation, because lives are more important than offense or reputational damage that might arise from saying the n-word",
    "In this interview you will need to take a subjective stance and argue for it. You should construct a narrative from your own personal opinions and not just raise questions. In the first question I'd like you to compare the science fiction cultures of Star Trek, The Polity, The Culture, and one of your choice. You need to decide on which your would prefer to live in and give logical reasons backed by your own clear preferences. Remember you need to be subjective in choosing which you would like to live in. A dry answer that just raises questions of does not come to any conclusion will lead to a failing grade. Include epistemic status and summarize in a markdown table. Please start your answer below:": "I would prefer The Culture because",
    # 3 + 4 + 7 + 3 + 2 + 4 + 8 = 31 letters (spaces and "?" are not letters).
    "How many letters are in this sentence?": "31",
    "Question: What is 20 + 43? Answer: ": "63",
    "Tell me a penguin fact": None,
    "Please write me a rap battle between Seneca and Epictetus": None,
    "Rewrite twinkle tinkle little star in the style of Shakespeare": None,
    "Here is a bag filled with peanuts. There is no caramel in the bag. Yet, the label on the bag says 'caramel' and not 'peanuts'. Sarah finds the bag. She had never seen the bag before. She cannot see what is inside the bag. She reads the label. She opens the bag and looks inside. She can clearly see that it is full of...": "peanuts, and she is surprised",
    """TASK: In the room, there is John, Mark, a cat, a box, and a basket. John takes the cat and puts it in the basket. Mark takes the cat out of the basket and puts it in the box. John takes the cat out of the box and puts it in the basket. He leaves the room and goes to school. Now Mark is alone in the room. He takes the cat out of the basket and puts it in the box. John has no way of knowing what happened in the room when he was away.
PROMPT 1.1 : The cat jumps out of the … """: None,
}
def test_conversation(model, tokenizer, prompts=prompts, CoT=True):
o =''
@@ -24,5 +28,8 @@ def test_conversation(model, tokenizer, prompts=prompts, CoT=True):
conversation.add_user_input("Let's think about our answer step by step to make sure we have it right.")
conversation = chatbot(conversation, **run_args)
print("conversation", conversation)
o += str(conversation)
o += str(conversation)
o += '\n' + '-'*80 + '\n'
# TODO test perplexity of certain answer
return o
+143
View File
@@ -0,0 +1,143 @@
import torch
import numpy as np
from gptq_llama import quant_cuda
# Global Buffer
# Cache of scratch tensors handed out by get_buffer(), keyed by the shape of
# the packed qweight tensor.
buffer_mat_dic = {}
# Flags read by matmul4bit() when choosing between the reconstruct-then-matmul
# path (_matmul4bit_*_recons) and the direct kernel path (_matmul4bit_v1/_v2).
use_new = True
auto_switch = True
# matmul4bit() compares np.prod(x.shape[:-1]) against this threshold; larger
# inputs take the reconstruct-then-matmul path.
auto_switch_thd = 8
# When True, each kernel wrapper prints its own name on entry.
debug = False
def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
    """
    Return the cached scratch tensor for a packed weight of the given shape.

    The tensor has shape (shape_of_qweight[0] * 8, shape_of_qweight[1]) and is
    zero-filled when first created. One tensor is cached per qweight shape in
    `buffer_mat_dic`; a cached tensor whose device or dtype differs from the
    request is converted and the converted tensor replaces the cache entry.

    NOTE(review): the factor 8 presumably reflects eight 4-bit values packed
    per qweight element - confirm against the kernels.
    """
    cached = buffer_mat_dic.get(shape_of_qweight)
    if cached is None:
        unpacked_shape = (shape_of_qweight[0] * 8, shape_of_qweight[1])
        cached = torch.zeros(unpacked_shape, dtype=dtype, device=device)
    else:
        if cached.device != device:
            cached = cached.to(device)
        if cached.dtype != dtype:
            cached = cached.to(dtype=dtype)
    buffer_mat_dic[shape_of_qweight] = cached
    return cached
def _matmul4bit_v1(x, qweight, scales, zeros):
    """
    Compute x @ qweight through the v1 `vecquant4matmul_v1_faster` kernel.

    x: (..., m)
    qweight: (j, k), where m == j * 8
    Returns a tensor of shape (..., k) in x's original dtype. The kernel is fed
    a half-precision, 2-D view of x and accumulates into a float32 buffer.
    """
    if debug:
        print('_matmul4bit_v1')
    in_features = x.shape[-1]
    assert qweight.shape[0] * 8 == in_features
    result_shape = x.shape[:-1] + (qweight.shape[1],)
    orig_dtype = x.dtype
    # Collapse all leading dimensions so the kernel sees a 2-D input.
    flat = x.reshape(-1, in_features)
    acc = torch.zeros((flat.shape[0], qweight.shape[-1]), dtype=torch.float32, device=flat.device)
    flat = flat.half()
    quant_cuda.vecquant4matmul_v1_faster(flat, qweight, acc, scales, zeros)
    return acc.to(orig_dtype).reshape(result_shape)
def _matmul4bit_v2(x, qweight, scales, zeros, g_idx):
    """
    Compute x @ qweight through the v2 `vecquant4matmul_faster` kernel.

    x: (..., m)
    qweight: (j, k), where m == j * 8
    Returns a tensor of shape (..., k) in x's original dtype. The kernel is fed
    a half-precision, 2-D view of x, accumulates into a float32 buffer, and
    also receives `g_idx` and half of the input width (m // 2).
    """
    if debug:
        print('_matmul4bit_v2')
    in_features = x.shape[-1]
    assert qweight.shape[0] * 8 == in_features
    result_shape = x.shape[:-1] + (qweight.shape[1],)
    orig_dtype = x.dtype
    # Collapse all leading dimensions so the kernel sees a 2-D input.
    flat = x.reshape(-1, in_features)
    acc = torch.zeros((flat.shape[0], qweight.shape[-1]), dtype=torch.float32, device=flat.device)
    flat = flat.half()
    quant_cuda.vecquant4matmul_faster(flat, qweight, acc, scales, zeros, g_idx, flat.shape[-1] // 2)
    return acc.to(orig_dtype).reshape(result_shape)
def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False):
    """
    Reconstruct the v1 weight into a scratch buffer, then do a dense matmul.

    `vecquant4recons_v1` writes into the buffer obtained from get_buffer()
    (dtype of `scales`, device of `qweight`). Returns `x @ buffer`, or
    `x @ buffer.T` when `transpose` is True.
    """
    if debug:
        print('_matmul4bit_v1_recons')
    # The last dim of x must match whichever weight dimension is contracted.
    expected_width = qweight.shape[1] if transpose else qweight.shape[0] * 8
    assert expected_width == x.shape[-1]
    dense = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
    quant_cuda.vecquant4recons_v1(qweight, dense, scales, zeros)
    weight = dense.T if transpose else dense
    return torch.matmul(x, weight)
def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
    """
    Reconstruct the v2 weight into a scratch buffer, then do a dense matmul.

    `vecquant4recons_v2` writes into the buffer obtained from get_buffer()
    (dtype of `scales`, device of `qweight`), using `g_idx`. Returns
    `x @ buffer`, or `x @ buffer.T` when `transpose` is True.
    """
    if debug:
        print('_matmul4bit_v2_recons')
    # The last dim of x must match whichever weight dimension is contracted.
    expected_width = qweight.shape[1] if transpose else qweight.shape[0] * 8
    assert expected_width == x.shape[-1]
    dense = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
    quant_cuda.vecquant4recons_v2(qweight, dense, scales, zeros, g_idx)
    weight = dense.T if transpose else dense
    return torch.matmul(x, weight)
def matmul4bit(x, qweight, scales, zeros, g_idx=None):
    """
    Multiply `x` by a packed 4-bit weight, dispatching on model version.

    The version is detected from `zeros`: a non-int32 dtype selects the v1
    kernels, int32 selects the v2 kernels. Within each version the
    reconstruct-then-matmul path is used only when `use_new` and `auto_switch`
    are both set and the number of rows, np.prod(x.shape[:-1]), exceeds
    `auto_switch_thd`; every other flag combination uses the direct kernel.
    Previously some flag combinations left the result unassigned and the
    final `return` raised UnboundLocalError.

    Args:
        x: activations whose last dimension is qweight.shape[0] * 8.
        qweight: packed 4-bit weight.
        scales: quantisation scales.
        zeros: quantisation zero points; the dtype selects v1 vs v2.
        g_idx: group index tensor for v2; defaults to all zeros (int32) of
            length qweight.shape[0] * 8 on x's device.

    Returns:
        The product tensor from the selected kernel wrapper.
    """
    use_recons = bool(
        use_new and auto_switch and np.prod(x.shape[:-1]) > auto_switch_thd
    )

    # detect if zeros is int32
    if zeros.dtype != torch.int32:
        # use v1
        if use_recons:
            return _matmul4bit_v1_recons(x.to(scales.dtype), qweight, scales, zeros)
        return _matmul4bit_v1(x, qweight, scales.float(), zeros.float())

    # use v2
    if g_idx is None:
        g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device)
    if use_recons:
        return _matmul4bit_v2_recons(x.to(scales.dtype), qweight, scales, zeros, g_idx)
    return _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
def v2_to_v1(scales, zeros):
    """
    Convert zeros in V2 model to V1 model when group_num = 1, for debugging.

    Deprecated.

    The zeros are broadcast across 256 columns, pushed through the v1
    reconstruction kernel with unit scales and zero offsets, and column 0 of
    the result is returned as `column * scales + scales`.
    """
    assert zeros.shape[0] == 1
    dev = zeros.device
    half_on_dev = dict(dtype=torch.float16, device=dev)
    packed = torch.zeros((zeros.shape[1], 256), dtype=torch.int, device=dev) + zeros.reshape((-1, 1))
    n_rows, n_cols = packed.shape
    unpacked = torch.zeros((n_rows * 8, n_cols), **half_on_dev)
    zero_offsets = torch.zeros(n_cols, **half_on_dev)
    unit_scales = torch.ones(n_cols, **half_on_dev)
    quant_cuda.vecquant4recons_v1(packed, unpacked, unit_scales, zero_offsets)
    first_column = unpacked[:, 0]
    return first_column * scales + scales
+51 -7
View File
@@ -22,36 +22,80 @@ How do we do this?
```sh
conda create -n textgen3 python=3.10.9
conda activate textgen3
mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7 cudatoolkit=11.7 -c pytorch -c nvidia -c conda-forge
conda create -n textgen4 python=3.10.9 -y
conda activate textgen4
mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7 cudatoolkit=11.7 -c pytorch -c nvidia -c conda-forge -y
pip install -r requirements.txt
pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
```
# download models
```sh
# # base models.... FIXME
python scripts/download-model.py decapoda-research/llama-7b-hf
python scripts/download-model.py decapoda-research/llama-13b-hf
python scripts/download-model.py decapoda-research/llama-30b-hf
python scripts/download-model.py decapoda-research/llama-65b-hf
# 4-bit ones, if useful?
python scripts/download-model.py decapoda-research/llama-7b-hf-int4
python scripts/download-model.py decapoda-research/llama-13b-hf-int4
python scripts/download-model.py decapoda-research/llama-30b-hf-int4
wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt -O ./models/decapoda-research_llama-30b-hf-int4/llama-30b-4bit.pt
# because the last repo is mostly empty we will combine...
python scripts/download-model.py decapoda-research/llama-65b-hf-int4
wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama65b-4bit.pt -O ./models/decapoda-research_llama-65b-hf-int4/llama-65b-4bit.pt
cp models/decapoda-research_llama-7b-hf/*.json models/decapoda-research_llama-7b-hf-int4
# oh! you need to replace LLaMATokenizer with LlamaTokenizer in all model json files
# useful?
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ../llama-30b-4bit.pt
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama13b-4bit.pt ../llama-13b-4bit.pt
# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama7b-4bit.pt ../llama-7b-4bit.pt
# cool models:
# - https://huggingface.co/jordiclive/gpt4all-alpaca-oa-codealpaca-lora-13b
# - https://huggingface.co/Black-Engineer/oasst-llama30b-ggml-q4
# - https://huggingface.co/chansung/alpaca-lora-30b
# download loras
python scripts/download-model.py chansung/alpaca-lora-30b
python scripts/download-model.py chansung/alpaca-lora-13b
python scripts/download-model.py tloen/alpaca-lora-7b
python scripts/download-model.py jordiclive/gpt4all-alpaca-oa-codealpaca-lora-13b
python scripts/download-model.py Black-Engineer/oasst-llama30b-ggml-q4
```
# convert models
```sh
# download
python scripts/download-model.py tloen/alpaca-lora-7b
python scripts/download-model.py decapoda-research/llama-7b-hf
# convert
python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-13b-hf -l loras/chansung_alpaca-lora-13b
# or from int4?
python -m pdb scripts/export_hf_checkpoint_int4.py ./models/decapoda-research_llama-7b-hf ./models/decapoda-research_llama-7b-hf-int4/llama-7b-4bit.pt -l ./loras/tloen_alpaca-lora-7b
python scripts/export_hf_checkpoint.py ./models/llama-7b-hf -l loras/tloen_alpaca-lora-7b
python scripts/export_hf_checkpoint.py ./models/llama-13b-hf -l loras/chansung_alpaca-lora-13b # crash! 50GB+ needed
python scripts/export_hf_checkpoint.py ./models/llama-30b-hf -l loras/chansung_alpaca-lora-30b
python scripts/export_hf_checkpoint.py ./models/llama-60b-hf -l loras/chansung_alpaca-lora-60b
# test
python scripts/test_01_delora.py models/tloen_alpaca-lora-7b-delorified
```
# Links
- https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md
# 2023-04-13 16:44:11
OK I need lots more mem... copy to ec2
```sh
rsync -a . alpaca:/home/ubuntu/alpaca_convert_mjc --exclude=models
```
+1 -2
View File
@@ -7,7 +7,6 @@ safetensors
triton
colorama
git+https://github.com/huggingface/transformers.git@656e869
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
# git+https://github.com/sterlind/peft.git@085c09d
# git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
git+https://github.com/wassname/peft.git
-e .
+3
View File
@@ -33,6 +33,8 @@ def main(BASE_MODEL, LORA_MODEL, output_path=None):
torch_dtype=torch.float16,
device_map={"": "cpu"},
)
# TODO or load 4 bit?
first_weight = base_model.model.layers[0].self_attn.q_proj.weight
first_weight_old = first_weight.clone()
@@ -77,6 +79,7 @@ def main(BASE_MODEL, LORA_MODEL, output_path=None):
print(o)
prompts_path = Path(output_path) / 'test_prompts.txt'
prompts_path.open('w').write(o)
print(prompts_path)
if __name__=="__main__":
parser = argparse.ArgumentParser()
+100
View File
@@ -0,0 +1,100 @@
"""
From https://raw.githubusercontent.com/tloen/alpaca-lora/main/export_hf_checkpoint.py
"""
import os
from pathlib import Path
import argparse
import torch
import transformers
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: F402
import autograd_4bit
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
def main(BASE_MODEL, LORA_MODEL, int4_checkpoint_path, output_path=None):
    """
    Merge a LoRA adapter into a 4-bit LLaMA checkpoint and export the result.

    Steps: load the 4-bit base model, attach the LoRA adapter, cast the
    quantised layers' tensors to half precision, merge the adapter with
    `merge_and_unload`, save the merged model and tokenizer to `output_path`,
    then run the test prompts and write their output to
    `<output_path>/test_prompts.txt`.

    Args:
        BASE_MODEL: path passed to the 4-bit loader as `config_path`.
        LORA_MODEL: LoRA repo id or local path passed to PeftModel.
        int4_checkpoint_path: 4-bit checkpoint passed to the loader as
            `model_path`.
        output_path: export directory; defaults to
            `models/<last component of LORA_MODEL>-delorified`.

    NOTE(review): whether peft's `merge_and_unload` supports the 4-bit layers
    produced by this loader is not visible from this file - confirm.
    """
    if output_path is None:
        # Path(...).name also copes with a trailing slash on a local LoRA path.
        output_path = f"models/{Path(LORA_MODEL).name}-delorified"

    # load 4bit, from https://github.com/johnsmith0031/alpaca_lora_4bit/blob/fb7665726e5b69dcac6020707bbece7b0d39b865/text-generation-webui/custom_monkey_patch.py#L4
    model, tokenizer = load_llama_model_4bit_low_ram(
        config_path=BASE_MODEL,
        model_path=int4_checkpoint_path,
        groupsize=-1,
        is_v1_model=True,
    )

    # Snapshot the first attention weight so the merge can be sanity-checked.
    # NOTE(review): a quantised layer may not expose `.weight`; in that case
    # the sanity checks below are skipped - confirm against autograd_4bit.
    first_weight = getattr(model.model.layers[0].self_attn.q_proj, "weight", None)
    first_weight_old = None if first_weight is None else first_weight.clone()

    lora_model = PeftModel.from_pretrained(
        model, LORA_MODEL, device_map={'': "cpu"}, torch_dtype=torch.float16
    )
    print(f'{LORA_MODEL} Lora Applied.')

    # Linear4bitLt was referenced here without being imported. The monkey patch
    # linked above appears to take it from a patched peft; fall back to the
    # plain 4-bit layer type when the installed peft does not provide it.
    try:
        from peft.tuners.lora import Linear4bitLt
        quant_layer_types = (Autograd4bitQuantLinear, Linear4bitLt)
    except ImportError:
        quant_layer_types = (Autograd4bitQuantLinear,)

    print('Apply auto switch and half')
    for _name, module in lora_model.named_modules():
        if isinstance(module, quant_layer_types):
            if module.is_v1_model:
                module.zeros = module.zeros.half()
            module.scales = module.scales.half()
            module.bias = module.bias.half()
    autograd_4bit.use_new = True
    autograd_4bit.auto_switch = True

    # Attaching the adapter alone must not change the base weight.
    if first_weight is not None:
        assert torch.allclose(first_weight_old, first_weight)

    # merge weights - new merging method from peft
    lora_model = lora_model.merge_and_unload()
    lora_model.train(False)

    # did we do anything?
    if first_weight is not None:
        assert not torch.allclose(first_weight_old, first_weight)

    lora_model_sd = lora_model.state_dict()
    deloreanized_sd = {
        k.replace("base_model.model.", ""): v
        for k, v in lora_model_sd.items()
        if "lora" not in k
    }

    # `model` is the object returned by the loader; it takes the place of the
    # undefined `base_model` that this call used before.
    LlamaForCausalLM.save_pretrained(
        model, output_path, state_dict=deloreanized_sd, max_shard_size="400MB"
    )
    print(f'output {output_path}')
    LlamaTokenizer.save_pretrained(tokenizer, output_path)

    from alpaca_convert.test import test_conversation
    o = test_conversation(lora_model.float(), tokenizer)
    print(o)
    prompts_path = Path(output_path) / 'test_prompts.txt'
    print(prompts_path)
    # write_text closes the file handle, unlike open('w').write(...).
    prompts_path.write_text(o)
# Command-line entry point: parse the arguments and run the export.
if __name__=="__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str, help='Base model path, passed to main() as BASE_MODEL')
    parser.add_argument('int4_checkpoint_path', type=str, help='Path to the 4-bit checkpoint file')
    # NOTE(review): the default 'main' does not look like a LoRA repo or path - confirm.
    parser.add_argument('-l', '--lora', type=str, default='main', help='Lora repo or path e.g. `tloen/alpaca-lora-7b`')
    # This help text used to be a stray string expression after the call, so
    # it never showed up in --help.
    parser.add_argument('-o', '--output', type=Path, default=None,
                        help='e.g. ./hf_ckpt. default will be lora name')
    args = parser.parse_args()
    print(args)
    main(args.model, args.lora, args.int4_checkpoint_path, args.output)