From e6338b5b26cb98aaa80fa4426049bfd2e3bc731e Mon Sep 17 00:00:00 2001 From: wassname Date: Sat, 15 Apr 2023 06:37:43 +0000 Subject: [PATCH] rm int4 --- mjc_notes.md | 37 +++------- scripts/export_hf_checkpoint_int4.py | 100 --------------------------- scripts/test_01_delora.py | 4 ++ 3 files changed, 15 insertions(+), 126 deletions(-) delete mode 100644 scripts/export_hf_checkpoint_int4.py diff --git a/mjc_notes.md b/mjc_notes.md index 01b3d8b..cc3f55e 100644 --- a/mjc_notes.md +++ b/mjc_notes.md @@ -37,24 +37,8 @@ python scripts/download-model.py decapoda-research/llama-7b-hf python scripts/download-model.py decapoda-research/llama-13b-hf python scripts/download-model.py decapoda-research/llama-30b-hf python scripts/download-model.py decapoda-research/llama-65b-hf - -# 4bit ones if usefull? -python scripts/download-model.py decapoda-research/llama-7b-hf-int4 -python scripts/download-model.py decapoda-research/llama-13b-hf-int4 -python scripts/download-model.py decapoda-research/llama-30b-hf-int4 -wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ./models/decapoda-research_llama-30b-hf-int4/llama-30b-4bit.pt -# because the last repo is mostly empty we will combine... -python scripts/download-model.py decapoda-research/llama-65b-hf-int4 -wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama65b-4bit.pt ./models/decapoda-research_llama-65b-hf-int4/llama-65b-4bit.pt -cp models/decapoda-research_llama-7b-hf/*.json models/decapoda-research_llama-7b-hf-int4 - # oh! you need to replace LLaMATokenizer with LlamaTokenizer in all model json files -# usefull? 
-# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt ../llama-30b-4bit.pt -# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama13b-4bit.pt ../llama-13b-4bit.pt -# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama7b-4bit.pt ../llama-7b-4bit.pt - # download loras python scripts/download-model.py chansung/alpaca-lora-30b python scripts/download-model.py chansung/alpaca-lora-13b @@ -69,20 +53,21 @@ python scripts/download-model.py Black-Engineer/oasst-llama30b-ggml-q4 # download python scripts/download-model.py tloen/alpaca-lora-7b python scripts/download-model.py decapoda-research/llama-7b-hf + # convert -python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-13b-hf -l loras/chansung_alpaca-lora-13b +python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-7b-hf -l loras/tloen_alpaca-lora-7b +python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-13b-hf -l loras/chansung_alpaca-lora-13b # crash! 50GB+ needed +python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-30b-hf -l loras/chansung_alpaca-lora-30b +python scripts/export_hf_checkpoint.py ./models/decapoda-research_llama-60b-hf -l loras/chansung_alpaca-lora-60b -# or from int4? -python -m pdb scripts/export_hf_checkpoint_int4.py ./models/decapoda-research_llama-7b-hf ./models/decapoda-research_llama-7b-hf-int4/llama-7b-4bit.pt -l ./loras/tloen_alpaca-lora-7b - - - -python scripts/export_hf_checkpoint.py ./models/llama-7b-hf -l loras/tloen_alpaca-lora-7b -python scripts/export_hf_checkpoint.py ./models/llama-13b-hf -l loras/chansung_alpaca-lora-13b # crash! 
50GB+ needed -python scripts/export_hf_checkpoint.py ./models/llama-30b-hf -l loras/chansung_alpaca-lora-30b -python scripts/export_hf_checkpoint.py ./models/llama-60b-hf -l loras/chansung_alpaca-lora-60b # test python scripts/test_01_delora.py models/tloen_alpaca-lora-7b-delorified +python scripts/test_01_delora.py models/chansung_alpaca-lora-13b-delorified +# now compare what was generated during conversion `test_prompts.txt`, to the loaded version + +# 4bit... + +# ggml conversion... ``` diff --git a/scripts/export_hf_checkpoint_int4.py b/scripts/export_hf_checkpoint_int4.py deleted file mode 100644 index 3d5a979..0000000 --- a/scripts/export_hf_checkpoint_int4.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -From https://raw.githubusercontent.com/tloen/alpaca-lora/main/export_hf_checkpoint.py -""" -import os -from pathlib import Path -import argparse -import torch -import transformers -from peft import PeftModel -from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: F402 -import autograd_4bit -from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear - -def main(BASE_MODEL, LORA_MODEL, int4_checkpoint_path, output_path=None): - - if output_path is None: - output_path = 'models/' + LORA_MODEL.split('/')[-1] + '-delorified' - - # load 4bit, from https://github.com/johnsmith0031/alpaca_lora_4bit/blob/fb7665726e5b69dcac6020707bbece7b0d39b865/text-generation-webui/custom_monkey_patch.py#L4 - model, tokenizer = load_llama_model_4bit_low_ram(config_path=BASE_MODEL, model_path=int4_checkpoint_path, groupsize=-1, is_v1_model=True) - lora_model = PeftModel.from_pretrained(model, LORA_MODEL, device_map={'': "cpu"}, torch_dtype=torch.float16) - print('{} Lora Applied.'.format(lora_path)) - - print('Apply auto switch and half') - for n, m in lora_model.named_modules(): - if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt): - if m.is_v1_model: - m.zeros = m.zeros.half() - m.scales = m.scales.half() - m.bias = m.bias.half() - 
autograd_4bit.use_new = True - autograd_4bit.auto_switch = True - - # tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL) - - # base_model = LlamaForCausalLM.from_pretrained( - # BASE_MODEL, - # load_in_8bit=False, - # torch_dtype=torch.float16, - # device_map={"": "cpu"}, - # ) - - # # TODO or load 4 bit? - - # first_weight = base_model.model.layers[0].self_attn.q_proj.weight - # first_weight_old = first_weight.clone() - - # lora_model = PeftModel.from_pretrained( - # base_model, - # LORA_MODEL, - # device_map={"": "cpu"}, - # torch_dtype=torch.float16, - # ) - - lora_weight = lora_model.base_model.model.model.layers[ - 0 - ].self_attn.q_proj.weight - - assert torch.allclose(first_weight_old, first_weight) - - # merge weights - new merging method from peft - lora_model = lora_model.merge_and_unload() - - lora_model.train(False) - - # did we do anything? - assert not torch.allclose(first_weight_old, first_weight) - - lora_model_sd = lora_model.state_dict() - deloreanized_sd = { - k.replace("base_model.model.", ""): v - for k, v in lora_model_sd.items() - if "lora" not in k - } - - LlamaForCausalLM.save_pretrained( - base_model, output_path, state_dict=deloreanized_sd, max_shard_size="400MB" - ) - print(f'output {output_path}') - LlamaTokenizer.save_pretrained(tokenizer, output_path) - # FIXME also save tokenizer - - from alpaca_convert.test import test_conversation - o = test_conversation(lora_model.float(), tokenizer) - print(o) - prompts_path = Path(output_path) / 'test_prompts.txt' - print(prompts_path) - prompts_path.open('w').write(o) - -if __name__=="__main__": - parser = argparse.ArgumentParser() - parser.add_argument('model', type=str) - parser.add_argument('int4_checkpoint_path', type=str) - parser.add_argument('-l', '--lora', type=str, default='main', help='Lora repo or path e.g. `tloen/alpaca-lora-7b`') - parser.add_argument('-o', '--output', type=Path, default=None) - "e.g. ./hf_ckpt. 
default will be lora name" - args = parser.parse_args() - print(args) - main(args.model, args.lora, args.int4_checkpoint_path, args.output) - diff --git a/scripts/test_01_delora.py b/scripts/test_01_delora.py index fc63c76..f6d59ed 100644 --- a/scripts/test_01_delora.py +++ b/scripts/test_01_delora.py @@ -27,6 +27,10 @@ tokenizer = LlamaTokenizer.from_pretrained(args.model) outs = test_conversation(model, tokenizer) print(outs) +prompts_path = Path(args.model) / 'test_prompts2.txt' +prompts_path.write_text(outs) +print(prompts_path) + # from transformers import pipeline, Conversation, ConversationalPipeline # # https://huggingface.co/tasks/conversational # prompts = [