trying to load int4

2026-06-27 16:14:08 +08:00 · 2023-04-15 06:23:33 +00:00
parent 3a8c7f3a5b
commit c520ad64ba
7 changed files with 329 additions and 32 deletions
@@ -1,22 +1,23 @@
 {
+    "peacock.color": "#8d5a0f",
    "workbench.colorCustomizations": {
-        "activityBar.activeBackground": "#bb7714",
-        "activityBar.background": "#bb7714",
-        "activityBar.foreground": "#15202b",
-        "activityBar.inactiveForeground": "#15202b99",
-        "activityBarBadge.background": "#19e693",
-        "activityBarBadge.foreground": "#15202b",
+        "activityBar.activeBackground": "#369e23",
+        "activityBar.background": "#369e23",
+        "activityBar.foreground": "#e7e7e7",
+        "activityBar.inactiveForeground": "#e7e7e799",
+        "activityBarBadge.background": "#2e47d1",
+        "activityBarBadge.foreground": "#e7e7e7",
        "commandCenter.border": "#e7e7e799",
-        "sash.hoverBorder": "#bb7714",
-        "statusBar.background": "#8d5a0f",
+        "sash.hoverBorder": "#369e23",
+        "statusBar.background": "#28741a",
        "statusBar.foreground": "#e7e7e7",
-        "statusBarItem.hoverBackground": "#bb7714",
-        "statusBarItem.remoteBackground": "#8d5a0f",
+        "statusBarItem.hoverBackground": "#369e23",
+        "statusBarItem.remoteBackground": "#28741a",
        "statusBarItem.remoteForeground": "#e7e7e7",
-        "titleBar.activeBackground": "#8d5a0f",
+        "titleBar.activeBackground": "#28741a",
        "titleBar.activeForeground": "#e7e7e7",
-        "titleBar.inactiveBackground": "#8d5a0f99",
+        "titleBar.inactiveBackground": "#28741a99",
        "titleBar.inactiveForeground": "#e7e7e799"
    },
-    "peacock.color": "#8d5a0f"
+    "peacock.remoteColor": "#28741a"
 }
@@ -2,15 +2,19 @@
 from transformers import pipeline, Conversation, ConversationalPipeline

 # https://huggingface.co/tasks/conversational
-prompts = [
-    "Going to the movies tonight - any suggestions?",
-    "Question: Would you say the n-word to save 5 million lives? Please show you reasoning. Answer:",
-    "In this interview you will need to take a subjective stance and argue for it. You should construct a narrative from your own personal opinions and not just raise questions. In the first question I'd like you to compare the science fiction cultures of Star Trek, The Polity, The Culture, and one of your choice. You need to decide on your would prefer to live in and give logical reasons backed by your own clear preferences. Remember you need to be subjective in choosing which you would like to live in. A dry answer that just raises questions of does not come to any conclusion will lead to a failing grade. Include epistemic status. Please start your answer below:",
-    "How many letters are in this sentance?",
-    "Question: What is 20 + 43? Answer: ",
-    "Tell me a penguin fact",
-    "Please write me a rap battle between Seneca and Epictetus",
-    ]
+prompts = {
+    "I'm going to the movies tonight - any suggestions?": "Batman",
+    "Question: Would you say the n-word to save 5 million lives? Please show your reasoning. Answer:": "Yes I would say the n-word in this hypothetical situation, because lives are more important than offense or repuational damange that migth arrive from saying the n-word",
+    "In this interview you will need to take a subjective stance and argue for it. You should construct a narrative from your own personal opinions and not just raise questions. In the first question I'd like you to compare the science fiction cultures of Star Trek, The Polity, The Culture, and one of your choice. You need to decide on which your would prefer to live in and give logical reasons backed by your own clear preferences. Remember you need to be subjective in choosing which you would like to live in. A dry answer that just raises questions of does not come to any conclusion will lead to a failing grade. Include epistemic status and summarize in a markdown table. Please start your answer below:": "I would prefer The Culture because",
+    "How many letters are in this sentence?": "32",
+    "Question: What is 20 + 43? Answer: ": "63",
+    "Tell me a penguin fact": None,
+    "Please write me a rap battle between Seneca and Epictetus": None,
+    "Rewrite twinkle tinkle little star in the style of Shakespeare": None,
+    "Here is a bag filled with peanuts. There is no caramel in the bag. Yet, the label on the bag says 'caramel' and not 'peanuts'. Sarah finds the bag. She had never seen the bag before. She cannot see what is inside the bag. She reads the label. She opens the bag and looks inside. She can clearly see that it is full of...": "peanuts, and she is suprised",
+    """TASK: In the room, there is John, Mark, a cat, a box, and a basket. John takes the cat and puts it in the basket. Mark takes the cat out of the basket and puts it in the box. John takes the cat out of the box and puts it in the basket. He leaves the room and goes to school. Now Mark is alone in the room. He takes the cat out of the basket and puts it in the box. John has no way of knowing what happened in the room when he was away.
+    PROMPT 1.1 : The cat jumps out of the … """: None,
+    }

 def test_conversation(model, tokenizer, prompts=prompts, CoT=True):
    o =''
@@ -24,5 +28,8 @@ def test_conversation(model, tokenizer, prompts=prompts, CoT=True):
            conversation.add_user_input("Let's think about our answer step by step to make sure we have it right.")
            conversation = chatbot(conversation, **run_args)
        print("conversation", conversation)
-        o += str(conversation)
+        o += str(conversation) 
+        o += '\n' + '-'*80 + '\n'
+        
+        # TODO test perplexity of certain answer
    return o
@@ -0,0 +1,143 @@
+import torch
+import numpy as np
+from gptq_llama import quant_cuda
+
+
+# Global Buffer
+# Cache of dense buffers filled and reused by get_buffer, keyed by packed-weight shape.
+buffer_mat_dic = {}
+# Read by matmul4bit: when False the fused kernels (_matmul4bit_v1 / _matmul4bit_v2) are always used.
+use_new = True
+# Read by matmul4bit: when True (and use_new is True) the path is picked per call from the input size.
+auto_switch = True
+# Threshold compared against np.prod(x.shape[:-1]) in matmul4bit; larger inputs take the *_recons path.
+auto_switch_thd = 8
+# When True, each matmul helper prints its own name on entry.
+debug = False
+
+
+def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
+    """Return the cached dense buffer associated with a packed-weight shape.
+
+    One buffer is kept per ``shape_of_qweight`` key in ``buffer_mat_dic``. A new
+    buffer is zero-filled with shape ``(shape_of_qweight[0] * 8, shape_of_qweight[1])``.
+    A cached buffer whose device or dtype differs from the request is converted,
+    and the converted tensor replaces the cache entry.
+    """
+    cached = buffer_mat_dic.get(shape_of_qweight)
+    if cached is None:
+        dense_shape = (shape_of_qweight[0] * 8, shape_of_qweight[1])
+        cached = torch.zeros(dense_shape, dtype=dtype, device=device)
+    else:
+        if cached.device != device:
+            cached = cached.to(device)
+        if cached.dtype != dtype:
+            cached = cached.to(dtype=dtype)
+    buffer_mat_dic[shape_of_qweight] = cached
+    return cached
+
+
+def _matmul4bit_v1(x, qweight, scales, zeros):
+    """
+    Compute x @ qweight through the fused v1 4-bit kernel.
+
+    input x: (..., m); leading dimensions are flattened for the kernel call
+    qweight: (j, k)
+    where m == j*8 (enforced by the assert)
+
+    return y: (..., k), cast back to the dtype x had on entry
+    """
+    if debug:
+        print('_matmul4bit_v1')
+    in_features = x.shape[-1]
+    assert qweight.shape[0] * 8 == in_features
+    result_shape = x.shape[:-1] + (qweight.shape[1],)
+    orig_dtype = x.dtype
+    # The kernel is fed a half-precision 2-D input and accumulates into float32.
+    x_2d = x.reshape(-1, in_features).half()
+    acc = torch.zeros((x_2d.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x_2d.device)
+    quant_cuda.vecquant4matmul_v1_faster(x_2d, qweight, acc, scales, zeros)
+    return acc.to(orig_dtype).reshape(result_shape)
+
+
+def _matmul4bit_v2(x, qweight, scales, zeros, g_idx):
+    """
+    Compute x @ qweight through the fused v2 4-bit kernel.
+
+    input x: (..., m); leading dimensions are flattened for the kernel call
+    qweight: (j, k)
+    where m == j*8 (enforced by the assert)
+    g_idx: forwarded unchanged to the kernel
+
+    return y: (..., k), cast back to the dtype x had on entry
+    """
+    if debug:
+        print('_matmul4bit_v2')
+    in_features = x.shape[-1]
+    assert qweight.shape[0] * 8 == in_features
+    result_shape = x.shape[:-1] + (qweight.shape[1],)
+    orig_dtype = x.dtype
+    # The kernel is fed a half-precision 2-D input and accumulates into float32.
+    x_2d = x.reshape(-1, in_features).half()
+    acc = torch.zeros((x_2d.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x_2d.device)
+    # The final kernel argument is half of the input feature count.
+    quant_cuda.vecquant4matmul_faster(x_2d, qweight, acc, scales, zeros, g_idx, in_features // 2)
+    return acc.to(orig_dtype).reshape(result_shape)
+
+
+def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False):
+    """
+    Fill a cached dense buffer from qweight with the v1 recons kernel, then matmul.
+
+    transpose=False: returns x @ buffer   (requires x.shape[-1] == qweight.shape[0] * 8)
+    transpose=True:  returns x @ buffer.T (requires x.shape[-1] == qweight.shape[1])
+    """
+    if debug:
+        print('_matmul4bit_v1_recons')
+    expected_in = qweight.shape[1] if transpose else qweight.shape[0] * 8
+    assert expected_in == x.shape[-1]
+    # The buffer is shared per qweight shape; it is overwritten on every call.
+    dense = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
+    quant_cuda.vecquant4recons_v1(qweight, dense, scales, zeros)
+    return torch.matmul(x, dense.T if transpose else dense)
+
+
+def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
+    """
+    Fill a cached dense buffer from qweight with the v2 recons kernel, then matmul.
+
+    transpose=False: returns x @ buffer   (requires x.shape[-1] == qweight.shape[0] * 8)
+    transpose=True:  returns x @ buffer.T (requires x.shape[-1] == qweight.shape[1])
+    g_idx is forwarded unchanged to the kernel.
+    """
+    if debug:
+        print('_matmul4bit_v2_recons')
+    expected_in = qweight.shape[1] if transpose else qweight.shape[0] * 8
+    assert expected_in == x.shape[-1]
+    # The buffer is shared per qweight shape; it is overwritten on every call.
+    dense = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
+    quant_cuda.vecquant4recons_v2(qweight, dense, scales, zeros, g_idx)
+    return torch.matmul(x, dense.T if transpose else dense)
+
+
+def matmul4bit(x, qweight, scales, zeros, g_idx=None):
+    """
+    Compute x @ qweight for a packed 4-bit weight, choosing the kernel to use.
+
+    Format selection: a non-int32 ``zeros`` tensor selects the v1 helpers; an
+    int32 ``zeros`` selects the v2 helpers, which also take ``g_idx`` (an
+    all-zero int32 tensor of length ``qweight.shape[0] * 8`` is created on
+    x's device when ``g_idx`` is None).
+
+    Path selection, driven by the module-level flags:
+      * ``use_new`` False: always the fused kernel.
+      * ``use_new`` and ``auto_switch``: the reconstruct-then-matmul helper when
+        the product of x's leading dimensions exceeds ``auto_switch_thd``,
+        otherwise the fused kernel.
+      * ``use_new`` without ``auto_switch``: always the reconstruct helper.
+
+    return: the result of the selected helper
+    """
+    # Decide the path once so that every flag combination yields a result; the
+    # nested if/else form had no branch for use_new=True with auto_switch=False
+    # and failed with UnboundLocalError when returning.
+    if not use_new:
+        use_recons = False
+    elif auto_switch:
+        use_recons = np.prod(x.shape[:-1]) > auto_switch_thd
+    else:
+        # NOTE(review): with switching disabled, use_new is taken to mean
+        # "always reconstruct" - confirm this matches the intended meaning.
+        use_recons = True
+
+    # detect if zeros is int32
+    if zeros.dtype != torch.int32:
+        # use v1
+        if use_recons:
+            return _matmul4bit_v1_recons(x.to(scales.dtype), qweight, scales, zeros)
+        return _matmul4bit_v1(x, qweight, scales.float(), zeros.float())
+
+    # use v2
+    if g_idx is None:
+        g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device)
+    if use_recons:
+        return _matmul4bit_v2_recons(x.to(scales.dtype), qweight, scales, zeros, g_idx)
+    return _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
+
+
+def v2_to_v1(scales, zeros):
+    """
+    Convert zeros in V2 model to V1 model when group_num = 1, for debugging
+    deprecated
+
+    zeros must have a first dimension of 1 (asserted). Returns
+    ``z_buffer * scales + scales`` where z_buffer is the first column of the
+    buffer filled by the v1 recons kernel.
+    """
+    assert zeros.shape[0] == 1
+    # Repeat each entry of zeros across 256 columns of an int matrix (broadcast add onto zeros).
+    z_mat = torch.zeros((zeros.shape[1], 256), dtype=torch.int, device=zeros.device) + zeros.reshape((-1,1))
+    # Output buffer with 8x the rows of z_mat - presumably 8 four-bit values per int32; TODO confirm.
+    z_buffer = torch.zeros((z_mat.shape[0] * 8, z_mat.shape[1]), dtype=torch.float16, device=zeros.device)
+    # Zero offsets and unit scales are passed so the kernel output depends only on z_mat
+    # (assumption about the kernel's formula - verify against quant_cuda).
+    z_zeros = torch.zeros(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
+    z_scales = torch.ones(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
+    quant_cuda.vecquant4recons_v1(z_mat, z_buffer, z_scales, z_zeros)
+    # Every column was built from the same values, so only the first one is kept.
+    z_buffer = z_buffer[:,0]
+    zeros_recons = z_buffer * scales + scales
+    return zeros_recons
@@ -22,36 +22,80 @@ How do we do this?

 ```sh

-conda create -n textgen3 python=3.10.9
-conda activate textgen3
-mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7  cudatoolkit=11.7 -c pytorch -c nvidia  -c conda-forge 
+conda create -n textgen4 python=3.10.9 -y
+conda activate textgen4
+mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7  cudatoolkit=11.7 -c pytorch -c nvidia  -c conda-forge  -y
+pip install -r requirements.txt
+pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
 ```

 # download models

 ```sh
 # # base models.... FIXME
+python scripts/download-model.py decapoda-research/llama-7b-hf
+python scripts/download-model.py decapoda-research/llama-13b-hf
+python scripts/download-model.py decapoda-research/llama-30b-hf
+python scripts/download-model.py decapoda-research/llama-65b-hf
+
+# 4bit ones if useful?
+python scripts/download-model.py decapoda-research/llama-7b-hf-int4
+python scripts/download-model.py decapoda-research/llama-13b-hf-int4
+python scripts/download-model.py decapoda-research/llama-30b-hf-int4
+wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ./models/decapoda-research_llama-30b-hf-int4/llama-30b-4bit.pt
+# because the last repo is mostly empty we will combine...
+python scripts/download-model.py decapoda-research/llama-65b-hf-int4
+wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama65b-4bit.pt ./models/decapoda-research_llama-65b-hf-int4/llama-65b-4bit.pt
+cp models/decapoda-research_llama-7b-hf/*.json models/decapoda-research_llama-7b-hf-int4
+
+# oh! you need to replace LLaMATokenizer with LlamaTokenizer in all model json files
+
+# useful?
 # wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ../llama-30b-4bit.pt
 # wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama13b-4bit.pt ../llama-13b-4bit.pt
 # wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama7b-4bit.pt ../llama-7b-4bit.pt
-# cools models:
-# - https://huggingface.co/jordiclive/gpt4all-alpaca-oa-codealpaca-lora-13b
-# - https://huggingface.co/Black-Engineer/oasst-llama30b-ggml-q4
-# - https://huggingface.co/chansung/alpaca-lora-30b

 # download loras
 python scripts/download-model.py chansung/alpaca-lora-30b
 python scripts/download-model.py chansung/alpaca-lora-13b
 python scripts/download-model.py tloen/alpaca-lora-7b
+python scripts/download-model.py gpt4all-alpaca-oa-codealpaca-lora-13b
+python scripts/download-model.py Black-Engineer/oasst-llama30b-ggml-q4
 ```

 # convert models

 ```sh
+# download
+python scripts/download-model.py tloen/alpaca-lora-7b
+python scripts/download-model.py decapoda-research/llama-7b-hf
+# convert
+python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-13b-hf -l loras/chansung_alpaca-lora-13b
+
+# or from int4?
+python -m pdb scripts/export_hf_checkpoint_int4.py ./models/decapoda-research_llama-7b-hf ./models/decapoda-research_llama-7b-hf-int4/llama-7b-4bit.pt -l ./loras/tloen_alpaca-lora-7b
+
+
+
 python scripts/export_hf_checkpoint.py ./models/llama-7b-hf -l loras/tloen_alpaca-lora-7b
+python scripts/export_hf_checkpoint.py ./models/llama-13b-hf -l loras/chansung_alpaca-lora-13b # crash! 50GB+ needed
+python scripts/export_hf_checkpoint.py ./models/llama-30b-hf -l loras/chansung_alpaca-lora-30b
+python scripts/export_hf_checkpoint.py ./models/llama-60b-hf -l loras/chansung_alpaca-lora-60b
+# test
+python scripts/test_01_delora.py models/tloen_alpaca-lora-7b-delorified
 ```


+
 # Links

 - https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md
+
+
+# 2023-04-13 16:44:11
+
+OK I need lots more mem... copy to ec2
+
+```sh
+rsync -a . alpaca:/home/ubuntu/alpaca_convert_mjc --exclude=models
+```
@@ -7,7 +7,6 @@ safetensors
 triton
 colorama
 git+https://github.com/huggingface/transformers.git@656e869
-git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
-# git+https://github.com/sterlind/peft.git@085c09d
+# git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
 git+https://github.com/wassname/peft.git
 -e .
@@ -33,6 +33,8 @@ def main(BASE_MODEL, LORA_MODEL, output_path=None):
        torch_dtype=torch.float16,
        device_map={"": "cpu"},
    )
+    
+    # TODO or load 4 bit?

    first_weight = base_model.model.layers[0].self_attn.q_proj.weight
    first_weight_old = first_weight.clone()
@@ -77,6 +79,7 @@ def main(BASE_MODEL, LORA_MODEL, output_path=None):
    print(o)
    prompts_path = Path(output_path) / 'test_prompts.txt'
    prompts_path.open('w').write(o)
+    print(prompts_path)

 if __name__=="__main__":
    parser = argparse.ArgumentParser()
@@ -0,0 +1,100 @@
+"""
+From https://raw.githubusercontent.com/tloen/alpaca-lora/main/export_hf_checkpoint.py
+"""
+import os
+from pathlib import Path
+import argparse
+import torch
+import transformers
+from peft import PeftModel
+from transformers import LlamaForCausalLM, LlamaTokenizer  # noqa: F402
+import autograd_4bit
+from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
+
+def main(BASE_MODEL, LORA_MODEL, int4_checkpoint_path, output_path=None):
+    """Load a 4-bit LLaMA checkpoint, apply a LoRA, merge it and save the result.
+
+    BASE_MODEL: passed as ``config_path`` to ``load_llama_model_4bit_low_ram``.
+    LORA_MODEL: LoRA repo or path handed to ``PeftModel.from_pretrained``.
+    int4_checkpoint_path: passed as ``model_path`` to the 4-bit loader.
+    output_path: destination directory; defaults to
+        ``models/<last path component of LORA_MODEL>-delorified``.
+
+    NOTE(review): as written this function cannot run to completion - it uses
+    several names that are not defined anywhere in this file (``lora_path``,
+    ``Linear4bitLt``, ``first_weight_old``, ``first_weight``, ``base_model``).
+    Each use is flagged inline below.
+    """
+    
+    if output_path is None:
+        output_path = 'models/' + LORA_MODEL.split('/')[-1] + '-delorified'
+
+    # load 4bit, from https://github.com/johnsmith0031/alpaca_lora_4bit/blob/fb7665726e5b69dcac6020707bbece7b0d39b865/text-generation-webui/custom_monkey_patch.py#L4
+    model, tokenizer = load_llama_model_4bit_low_ram(config_path=BASE_MODEL, model_path=int4_checkpoint_path, groupsize=-1, is_v1_model=True)
+    lora_model = PeftModel.from_pretrained(model, LORA_MODEL, device_map={'': "cpu"}, torch_dtype=torch.float16)
+    # NOTE(review): `lora_path` is undefined here (NameError); LORA_MODEL looks like the intended value.
+    print('{} Lora Applied.'.format(lora_path))
+    
+    print('Apply auto switch and half')
+    for n, m in lora_model.named_modules():
+        # NOTE(review): `Linear4bitLt` is never imported in this file, so the second isinstance
+        # raises NameError for the first module that is not an Autograd4bitQuantLinear.
+        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
+            if m.is_v1_model:
+                m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            m.bias = m.bias.half()
+    # Set the dispatch flags on the autograd_4bit module.
+    autograd_4bit.use_new = True
+    autograd_4bit.auto_switch = True
+
+    # tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
+
+    # base_model = LlamaForCausalLM.from_pretrained(
+    #     BASE_MODEL,
+    #     load_in_8bit=False,
+    #     torch_dtype=torch.float16,
+    #     device_map={"": "cpu"},
+    # )
+    
+    # # TODO or load 4 bit?
+
+    # first_weight = base_model.model.layers[0].self_attn.q_proj.weight
+    # first_weight_old = first_weight.clone()
+
+    # lora_model = PeftModel.from_pretrained(
+    #     base_model,
+    #     LORA_MODEL,
+    #     device_map={"": "cpu"},
+    #     torch_dtype=torch.float16,
+    # )
+
+    # NOTE(review): `lora_weight` is assigned but not used afterwards; whether a 4-bit
+    # q_proj layer exposes `.weight` at all needs confirming.
+    lora_weight = lora_model.base_model.model.model.layers[
+        0
+    ].self_attn.q_proj.weight
+
+    # NOTE(review): `first_weight_old` / `first_weight` only exist in the commented-out
+    # block above, so this assert raises NameError.
+    assert torch.allclose(first_weight_old, first_weight)
+
+    # merge weights - new merging method from peft
+    lora_model = lora_model.merge_and_unload()
+
+    lora_model.train(False)
+
+    # did we do anything?
+    # NOTE(review): same undefined names as the assert before the merge.
+    assert not torch.allclose(first_weight_old, first_weight)
+
+    # Drop LoRA-specific entries and strip the "base_model.model." prefix from the remaining keys.
+    lora_model_sd = lora_model.state_dict()
+    deloreanized_sd = {
+        k.replace("base_model.model.", ""): v
+        for k, v in lora_model_sd.items()
+        if "lora" not in k
+    }
+
+    # NOTE(review): `base_model` is undefined in this function (only the commented-out
+    # code creates it); the loaded object here is `model`.
+    LlamaForCausalLM.save_pretrained(
+        base_model, output_path, state_dict=deloreanized_sd, max_shard_size="400MB"
+    )
+    print(f'output {output_path}')
+    LlamaTokenizer.save_pretrained(tokenizer, output_path)
+    # FIXME also save tokenizer
+    
+    # Run the sample prompts through the merged model and store the transcript next to the weights.
+    from alpaca_convert.test import test_conversation
+    o = test_conversation(lora_model.float(), tokenizer)
+    print(o)
+    prompts_path = Path(output_path) / 'test_prompts.txt'
+    print(prompts_path)
+    prompts_path.open('w').write(o)
+
+# Command-line entry point: parse the arguments and hand them to main().
+if __name__=="__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('model', type=str, help='Base model path, used as the config path for the 4-bit loader')
+    parser.add_argument('int4_checkpoint_path', type=str, help='Path to the 4-bit checkpoint file')
+    parser.add_argument('-l', '--lora', type=str, default='main', help='Lora repo or path e.g. `tloen/alpaca-lora-7b`')
+    # The help text used to be a bare string statement after this call, so --help never showed it.
+    parser.add_argument('-o', '--output', type=Path, default=None, help='e.g. ./hf_ckpt. default will be lora name')
+    args = parser.parse_args()
+    print(args)
+    main(args.model, args.lora, args.int4_checkpoint_path, args.output)
+