init

2026-06-27 14:58:51 +08:00 · 2023-04-10 16:15:52 +08:00
parent 5d3267d80d
commit 6e6c8f8e7a
22 changed files with 776 additions and 1982 deletions
@@ -5,3 +5,7 @@ llama-13b-4bit
 llama-13b-4bit.pt
 text-generation-webui/
 repository/
+loras/
+loras
+models/
+models
@@ -0,0 +1,22 @@
+{
+    "workbench.colorCustomizations": {
+        "activityBar.activeBackground": "#bb7714",
+        "activityBar.background": "#bb7714",
+        "activityBar.foreground": "#15202b",
+        "activityBar.inactiveForeground": "#15202b99",
+        "activityBarBadge.background": "#19e693",
+        "activityBarBadge.foreground": "#15202b",
+        "commandCenter.border": "#e7e7e799",
+        "sash.hoverBorder": "#bb7714",
+        "statusBar.background": "#8d5a0f",
+        "statusBar.foreground": "#e7e7e7",
+        "statusBarItem.hoverBackground": "#bb7714",
+        "statusBarItem.remoteBackground": "#8d5a0f",
+        "statusBarItem.remoteForeground": "#e7e7e7",
+        "titleBar.activeBackground": "#8d5a0f",
+        "titleBar.activeForeground": "#e7e7e7",
+        "titleBar.inactiveBackground": "#8d5a0f99",
+        "titleBar.inactiveForeground": "#e7e7e799"
+    },
+    "peacock.color": "#8d5a0f"
+}
@@ -1,78 +0,0 @@
-# syntax = docker/dockerfile:experimental
-
-# Dockerfile is split into parts because we want to cache building the requirements and downloading the model, both of which can take a long time.
-
-FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS builder
-
-RUN apt-get update && apt-get install -y python3 python3-pip git
-
-RUN pip3 install --upgrade pip 
-
-# Some of the requirements expect some python packages in their setup.py, just install them first.
-RUN --mount=type=cache,target=/root/.cache/pip pip install --user torch==2.0.0
-RUN --mount=type=cache,target=/root/.cache/pip pip install --user semantic-version==2.10.0 requests tqdm
-
-# The docker build environment has trouble detecting CUDA version, build for all reasonable archs
-ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6"
-COPY requirements.txt requirements.txt
-RUN --mount=type=cache,target=/root/.cache pip install --user -r requirements.txt
-
-# -------------------------------
-
-# Download the model
-FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS downloader
-RUN apt-get update && apt-get install -y wget
-
-RUN wget --progress=bar:force:noscroll https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt
-
-
-
-# -------------------------------
-
-#FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
-FROM nvidia/cuda:11.7.0-devel-ubuntu22.04
-
-RUN --mount=type=cache,target=/var/cache/apt apt-get update && apt-get install -y git python3 python3-pip
-
-RUN ln -s `which python3` /usr/bin/python
-
-
-# Copy the installed packages from the first stage
-COPY --from=builder /root/.local /root/.local
-
-RUN mkdir alpaca_lora_4bit
-WORKDIR alpaca_lora_4bit
-
-COPY --from=downloader llama-7b-4bit.pt llama-7b-4bit.pt
-
-#RUN git clone --depth=1 --branch main https://github.com/andybarry/text-generation-webui-4bit.git text-generation-webui-tmp
-
-RUN git clone --depth=1 --branch main https://github.com/oobabooga/text-generation-webui.git text-generation-webui-tmp
-
-RUN --mount=type=cache,target=/root/.cache pip install --user markdown gradio
-
-# Apply monkey patch
-RUN cd text-generation-webui-tmp && printf '%s'"import custom_monkey_patch # apply monkey patch\nimport gc\n\n" | cat - server.py > tmpfile && mv tmpfile server.py
-
-# Get the model config
-RUN cd text-generation-webui-tmp && python download-model.py --text-only decapoda-research/llama-7b-hf && mv models/decapoda-research_llama-7b-hf ../llama-7b-4bit
-
-
-# Get LoRA
-RUN cd text-generation-webui-tmp && python download-model.py samwit/alpaca7b-lora && mv loras/samwit_alpaca7b-lora ../alpaca7b_lora
-
-COPY *.py .
-COPY text-generation-webui text-generation-webui
-COPY monkeypatch .
-
-RUN mv -f text-generation-webui-tmp/* text-generation-webui/
-
-# Symlink for monkeypatch
-RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py .
-
-# Swap to the 7bn parameter model
-RUN sed -i 's/llama-13b-4bit/llama-7b-4bit/g' text-generation-webui/custom_monkey_patch.py && sed -i 's/alpaca13b_lora/alpaca7b_lora/g' text-generation-webui/custom_monkey_patch.py
-
-# Run the server
-WORKDIR /alpaca_lora_4bit/text-generation-webui
-CMD ["python", "-u", "server.py", "--listen", "--chat"]
@@ -1,105 +0,0 @@
-import os
-class Finetune4bConfig:
-    """Config holder for LLaMA 4bit finetuning
-    """
-    def __init__(self, dataset: str, ds_type: str,
-                 lora_out_dir: str, lora_apply_dir: str, resume_checkpoint: str,
-                 llama_q4_config_dir: str, llama_q4_model: str,
-                 mbatch_size: int, batch_size: int,
-                 epochs: int, lr: float,
-                 cutoff_len: int,
-                 lora_r: int, lora_alpha: int, lora_dropout: float,
-                 val_set_size: float,
-                 gradient_checkpointing: bool,
-                 gradient_checkpointing_ratio: float,
-                 warmup_steps: int, save_steps: int, save_total_limit: int, logging_steps: int,
-                 checkpoint: bool, skip: bool, verbose: bool,
-                 txt_row_thd: int, use_eos_token: bool, groupsize: int, v1: bool,
-                 local_rank: int, flash_attention: bool, backend: str
-                 ):
-        """
-        Args:
-            dataset (str): Path to dataset file
-            ds_type (str): Dataset structure format
-            lora_out_dir (str): Directory to place new LoRA
-            lora_apply_dir (str): Path to directory from which LoRA has to be applied before training
-            resume_checkpoint (str): Path to Specified checkpoint you want to resume.
-            llama_q4_config_dir (str): Path to the config.json, tokenizer_config.json, etc
-            llama_q4_model (str): Path to the quantized model in huggingface format
-            mbatch_size (int): Micro-batch size
-            batch_size (int): Batch size
-            epochs (int): Epochs
-            lr (float): Learning rate
-            cutoff_len (int): Cutoff length
-            lora_r (int): LoRA R
-            lora_alpha (int): LoRA Alpha
-            lora_dropout (float): LoRA Dropout
-            gradient_checkpointing (bool) : Use gradient checkpointing
-            gradient_checkpointing_ratio (float) : Gradient checkpoint ratio
-            val_set_size (int): Validation set size
-            warmup_steps (int): Warmup steps before training
-            save_steps (int): Save steps
-            save_total_limit (int): Save total limit
-            logging_steps (int): Logging steps
-            checkpoint (bool): Produce checkpoint instead of LoRA
-            skip (bool): Don't train model
-            verbose (bool): If output log of training
-            txt_row_thd (int): Custom row thd for txt file
-            use_eos_token (bool): Use Eos token instead of padding with 0
-            groupsize (int): Group size of V2 model
-            v1 (bool): v1 model flag
-            local_rank (int): local rank if using torch.distributed.launch
-            flash_attention (bool): Enables flash attention
-        """
-        self.dataset = dataset
-        self.ds_type = ds_type
-        self.lora_out_dir = lora_out_dir
-        self.lora_apply_dir = lora_apply_dir
-        self.resume_checkpoint = resume_checkpoint
-        self.llama_q4_config_dir = llama_q4_config_dir
-        self.llama_q4_model = llama_q4_model
-        self.mbatch_size = mbatch_size
-        self.batch_size = batch_size
-        self.gradient_accumulation_steps = self.batch_size // self.mbatch_size
-        self.epochs = epochs
-        self.lr = lr
-        self.cutoff_len = cutoff_len
-        self.lora_r = lora_r
-        self.lora_alpha = lora_alpha
-        self.lora_dropout = 0 if gradient_checkpointing else lora_dropout # should be 0 if gradient checkpointing is on
-        self.val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
-        self.gradient_checkpointing = gradient_checkpointing
-        self.gradient_checkpointing_ratio = gradient_checkpointing_ratio
-        self.warmup_steps = warmup_steps
-        self.save_steps = save_steps
-        self.save_total_limit = save_total_limit
-        self.logging_steps = logging_steps
-        self.checkpoint = checkpoint
-        self.skip = skip
-        self.verbose = verbose
-        self.txt_row_thd = txt_row_thd
-        self.use_eos_token = use_eos_token
-        self.world_size = int(os.environ.get("WORLD_SIZE", 1))
-        self.local_rank = int(os.environ.get("LOCAL_RANK", local_rank))
-        self.ddp = self.world_size != 1
-        self.device_map = "auto" if not self.ddp else {"": self.local_rank}
-        if self.ddp:
-            self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size
-        self.groupsize = groupsize
-        self.v1 = v1
-        self.flash_attention = flash_attention
-        self.backend = backend
-
-
-    def __str__(self) -> str:
-        s = f"\nParameters:\n{'config':-^20}\n{self.dataset=}\n{self.ds_type=}\n{self.lora_out_dir=}\n{self.lora_apply_dir=}\n{self.llama_q4_config_dir=}\n{self.llama_q4_model=}\n\n" +\
-        f"{'training':-^20}\n" +\
-        f"{self.mbatch_size=}\n{self.batch_size=}\n{self.gradient_accumulation_steps=}\n{self.epochs=}\n{self.lr=}\n{self.cutoff_len=}\n" +\
-        f"{self.lora_r=}\n{self.lora_alpha=}\n{self.lora_dropout=}\n{self.val_set_size=}\n" +\
-        f"{self.gradient_checkpointing=}\n{self.gradient_checkpointing_ratio=}\n" +\
-        f"{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
-        f"{self.logging_steps=}\n" +\
-        f"{self.checkpoint=}\n{self.skip=}\n" +\
-        f"{self.world_size=}\n{self.ddp=}\n{self.device_map=}\n" +\
-        f"{self.groupsize=}\n{self.v1=}\n{self.backend=}\n"
-        return s.replace("self.", "")
@@ -1,26 +0,0 @@
-import torch
-
-
-class AMPWrapper:
-    
-    def __init__(self, model, options=None):
-        self.model = model
-        self.options = options
-        if self.options is None:
-            self.options = {'enabled': True, 'device_type': 'cuda'}
-        
-    def autocast_forward(self, *args, **kwargs):
-        with torch.amp.autocast(**self.options):
-            return self.model.non_autocast_forward(*args, **kwargs)
-    
-    def autocast_generate(self, *args, **kwargs):
-        with torch.amp.autocast(**self.options):
-            return self.model.non_autocast_generate(*args, **kwargs)
-    
-    def apply_forward(self):
-        self.model.non_autocast_forward = self.model.forward
-        self.model.forward = self.autocast_forward
-        
-    def apply_generate(self):
-        self.model.non_autocast_generate = self.model.generate
-        self.model.generate = self.autocast_generate
@@ -1,115 +0,0 @@
-import os
-import argparse
-from Finetune4bConfig import Finetune4bConfig
-
-def parse_commandline():
-    parser = argparse.ArgumentParser(
-        prog=__file__.split(os.path.sep)[-1],
-        description="Produce LoRA in 4bit training",
-        usage="%(prog)s [config] [training]\n\nAll arguments are optional"
-    )
-    
-    parser.add_argument("dataset", nargs="?",
-        default="./dataset.json", 
-        help="Path to dataset file. Default: %(default)s"
-    )
-    
-    parser_config = parser.add_argument_group("config")
-    parser_training = parser.add_argument_group("training")
-
-    # Config args group
-    parser_config.add_argument("--ds_type", choices=["txt", "alpaca", "gpt4all"], default="alpaca", required=False,
-        help="Dataset structure format. Default: %(default)s"
-    )
-    parser_config.add_argument("--lora_out_dir", default="alpaca_lora", required=False,
-        help="Directory to place new LoRA. Default: %(default)s"
-    )
-    parser_config.add_argument("--lora_apply_dir", default=None, required=False,
-        help="Path to directory from which LoRA has to be applied before training. Default: %(default)s"
-    )
-    parser_training.add_argument("--resume_checkpoint", default=None, required=False,
-        help="Resume training from specified checkpoint. Default: %(default)s"
-    )
-    parser_config.add_argument("--llama_q4_config_dir", default="./llama-13b-4bit/", required=False,
-        help="Path to the config.json, tokenizer_config.json, etc. Default: %(default)s"
-    )
-    parser_config.add_argument("--llama_q4_model", default="./llama-13b-4bit.pt", required=False,
-        help="Path to the quantized model in huggingface format. Default: %(default)s"
-    )
-
-    # Training args group
-    parser_training.add_argument("--mbatch_size", default=1, type=int, help="Micro-batch size. Default: %(default)s")
-    parser_training.add_argument("--batch_size", default=2, type=int, help="Batch size. Default: %(default)s")
-    parser_training.add_argument("--epochs", default=3, type=int, help="Epochs. Default: %(default)s")
-    parser_training.add_argument("--lr", default=2e-4, type=float, help="Learning rate. Default: %(default)s")
-    parser_training.add_argument("--cutoff_len", default=256, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--lora_r", default=8, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--lora_alpha", default=16, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--lora_dropout", default=0.05, type=float, help="Default: %(default)s")
-    parser_training.add_argument("--grad_chckpt", action="store_true", required=False, help="Use gradient checkpoint. For 30B model. Default: %(default)s")
-    parser_training.add_argument("--grad_chckpt_ratio", default=1, type=float, help="Gradient checkpoint ratio. Default: %(default)s")
-    parser_training.add_argument("--val_set_size", default=0.2, type=float, help="Validation set size. Default: %(default)s")
-    parser_training.add_argument("--warmup_steps", default=50, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--save_steps", default=50, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--save_total_limit", default=3, type=int, help="Default: %(default)s")
-    parser_training.add_argument("--logging_steps", default=10, type=int, help="Default: %(default)s")
-    parser_training.add_argument("-c", "--checkpoint", action="store_true", help="Produce checkpoint instead of LoRA. Default: %(default)s")
-    parser_training.add_argument("--skip", action="store_true", help="Don't train model. Can be useful to produce checkpoint from existing LoRA. Default: %(default)s")
-    parser_training.add_argument("--verbose", action="store_true", help="If output log of training. Default: %(default)s")
-
-    # Data args
-    parser_training.add_argument("--txt_row_thd", default=-1, type=int, help="Custom thd for txt rows.")
-    parser_training.add_argument("--use_eos_token", default=1, type=int, help="Use eos token instead if padding with 0. enable with 1, disable with 0.")
-    
-    # V2 model support
-    parser_training.add_argument("--groupsize", type=int, default=-1, help="Groupsize of v2 model")
-    parser_training.add_argument("--v1", action="store_true", help="Use V1 model")
-
-    # Multi GPU Support
-    parser_training.add_argument("--local_rank", type=int, default=0, help="local rank if using torch.distributed.launch")
-    
-    # Flash Attention
-    parser_training.add_argument("--flash_attention", action="store_true", help="enables flash attention, can improve performance and reduce VRAM use")
-
-    # Train Backend
-    parser_training.add_argument("--backend", type=str, default='cuda', help="Backend to use. Triton or Cuda.")
-
-    return vars(parser.parse_args())
-
-
-def get_config() -> Finetune4bConfig:
-    args = parse_commandline()
-    return Finetune4bConfig(
-        dataset=args["dataset"], 
-        ds_type=args["ds_type"], 
-        lora_out_dir=args["lora_out_dir"], 
-        lora_apply_dir=args["lora_apply_dir"],
-        resume_checkpoint=args["resume_checkpoint"],
-        llama_q4_config_dir=args["llama_q4_config_dir"],
-        llama_q4_model=args["llama_q4_model"],
-        mbatch_size=args["mbatch_size"],
-        batch_size=args["batch_size"],
-        epochs=args["epochs"], 
-        lr=args["lr"],
-        cutoff_len=args["cutoff_len"],
-        lora_r=args["lora_r"], 
-        lora_alpha=args["lora_alpha"], 
-        lora_dropout=args["lora_dropout"],
-        val_set_size=args["val_set_size"],
-        gradient_checkpointing=args["grad_chckpt"],
-        gradient_checkpointing_ratio=args["grad_chckpt_ratio"],
-        warmup_steps=args["warmup_steps"],
-        save_steps=args["save_steps"],
-        save_total_limit=args["save_total_limit"],
-        logging_steps=args["logging_steps"],
-        checkpoint=args["checkpoint"],
-        skip=args["skip"],
-        verbose=args["verbose"],
-        txt_row_thd=args["txt_row_thd"],
-        use_eos_token=args["use_eos_token"]!=0,
-        groupsize=args["groupsize"],
-        v1=args["v1"],
-        local_rank=args["local_rank"],
-        flash_attention=args["flash_attention"],
-        backend=args["backend"],
-    )
@@ -1,292 +1,295 @@
-import matmul_utils_4bit as mm4b
-import torch
-import torch.nn as nn
-import time
-import math
-from torch.cuda.amp import custom_bwd, custom_fwd
-from colorama import init, Fore, Back, Style
-init(autoreset=True)
-
-
-class AutogradMatmul4bitCuda(torch.autograd.Function):
-
-    @staticmethod
-    @custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
-        ctx.save_for_backward(qweight, scales, zeros, g_idx)
-        if g_idx is None:
-            output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
-        else:
-            output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
-        output = output.clone()
-        return output
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, grad_output):
-        qweight, scales, zeros, g_idx = ctx.saved_tensors
-        if ctx.needs_input_grad[0]:
-            if g_idx is None:
-                grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
-            else:
-                grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
-        return grad, None, None, None, None, None, None
-
-
-try:
-    import triton_utils as tu
-
-    class AutogradMatmul4bitTriton(torch.autograd.Function):
-
-        @staticmethod
-        @custom_fwd(cast_inputs=torch.float16)
-        def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
-            output = tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
-            ctx.save_for_backward(qweight, scales, qzeros, g_idx)
-            ctx.bits, ctx.maxq = bits, maxq
-            output = output.clone()
-            return output
-        
-        @staticmethod
-        @custom_bwd
-        def backward(ctx, grad_output):
-            qweight, scales, qzeros, g_idx = ctx.saved_tensors
-            bits, maxq = ctx.bits, ctx.maxq
-            grad_input = None
-
-            if ctx.needs_input_grad[0]:
-                grad_input = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
-            return grad_input, None, None, None, None, None, None
-    
-except ImportError:
-    print('Triton not found. Please run "pip install triton".')
-
-
-AutogradMatmul4bit = AutogradMatmul4bitCuda
-backend = 'cuda'
-
-
-def switch_backend_to(to_backend):
-    global AutogradMatmul4bit
-    global backend
-    if to_backend == 'cuda':
-        AutogradMatmul4bit = AutogradMatmul4bitCuda
-        backend = 'cuda'
-        print(Style.BRIGHT + Fore.GREEN + 'Using CUDA implementation.')
-    elif to_backend == 'triton':
-        # detect if AutogradMatmul4bitTriton is defined
-        if 'AutogradMatmul4bitTriton' not in globals():
-            raise ValueError('Triton not found. Please install triton_utils.')
-        AutogradMatmul4bit = AutogradMatmul4bitTriton
-        backend = 'triton'
-        print(Style.BRIGHT + Fore.GREEN + 'Using Triton implementation.')
-    else:
-        raise ValueError('Backend not supported.')
-
-
-def matmul4bit_with_backend(x, qweight, scales, qzeros, g_idx, bits, maxq):
-    if backend == 'cuda':
-        return mm4b.matmul4bit(x, qweight, scales, qzeros, g_idx)
-    elif backend == 'triton':
-        assert qzeros.dtype == torch.int32
-        return tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
-    else:
-        raise ValueError('Backend not supported.')
-
-
-# Assumes layer is perfectly divisible into 256 * 256 blocks
-class Autograd4bitQuantLinear(nn.Module):
-
-    def __init__(self, in_features, out_features, groupsize=-1, is_v1_model=False):
-        super().__init__()
-        bits = 4
-        self.in_features = in_features
-        self.out_features = out_features
-        self.bits = bits
-        self.maxq = 2 ** self.bits - 1
-        groupsize = groupsize if groupsize != -1 else in_features
-        self.groupsize = groupsize
-        self.is_v1_model = is_v1_model
-        if is_v1_model:
-            self.register_buffer('zeros', torch.empty((out_features, 1)))
-            self.register_buffer('scales', torch.empty((out_features, 1)))
-            self.g_idx = None
-        else:
-            self.register_buffer('qzeros',
-                                  torch.empty((math.ceil(in_features/groupsize), out_features // 256 * (bits * 8)), dtype=torch.int32)
-                                )
-            self.register_buffer('scales', torch.empty((math.ceil(in_features/groupsize), out_features)))
-            self.register_buffer('g_idx', torch.tensor([i // self.groupsize  for i in range(in_features)], dtype = torch.int32))
-        self.register_buffer('bias', torch.empty(out_features))
-        self.register_buffer(
-            'qweight', torch.empty((in_features // 256 * (bits * 8), out_features), dtype=torch.int32)
-        )
-
-
-    def forward(self, x):
-        if torch.is_grad_enabled():
-            out = AutogradMatmul4bit.apply(x, self.qweight, self.scales,
-                                           self.qzeros if not self.is_v1_model else self.zeros,
-                                           self.g_idx, self.bits, self.maxq)
-        else:
-            out = matmul4bit_with_backend(x, self.qweight, self.scales,
-                                          self.qzeros if not self.is_v1_model else self.zeros,
-                                          self.g_idx, self.bits, self.maxq)
-        out += self.bias
-        return out
-
-
-def make_quant_for_4bit_autograd(module, names, name='', groupsize=-1, is_v1_model=False):
-    if isinstance(module, Autograd4bitQuantLinear):
-        return
-    for attr in dir(module):
-        tmp = getattr(module, attr)
-        name1 = name + '.' + attr if name != '' else attr
-        if name1 in names:
-            setattr(
-                module, attr, Autograd4bitQuantLinear(tmp.in_features, tmp.out_features, groupsize=groupsize, is_v1_model=is_v1_model)
-            )
-    for name1, child in module.named_children():
-        make_quant_for_4bit_autograd(child, names, name + '.' + name1 if name != '' else name1, groupsize=groupsize, is_v1_model=is_v1_model)
-
-
-def model_to_half(model):
-    model.half()
-    for n, m in model.named_modules():
-        if isinstance(m, Autograd4bitQuantLinear):
-            if m.is_v1_model:
-                m.zeros = m.zeros.half()
-            m.scales = m.scales.half()
-            m.bias = m.bias.half()
-    print(Style.BRIGHT + Fore.YELLOW + 'Converted as Half.')
-
-
-def model_to_float(model):
-    model.float()
-    for n, m in model.named_modules():
-        if isinstance(m, Autograd4bitQuantLinear):
-            if m.is_v1_model:
-                m.zeros = m.zeros.float()
-            m.scales = m.scales.float()
-            m.bias = m.bias.float()
-    print(Style.BRIGHT + Fore.YELLOW + 'Converted as Float.')
-
-
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
-    if type(module) in layers:
-        return {name: module}
-    res = {}
-    for name1, child in module.named_children():
-        res.update(find_layers(
-            child, layers=layers, name=name + '.' + name1 if name != '' else name1
-        ))
-    return res
-
-
-def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=False, device_map="auto", seqlen=2048, is_v1_model=False):
-    import accelerate
-    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
-
-    print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
-    t0 = time.time()
-
-    with accelerate.init_empty_weights():
-        config = LlamaConfig.from_pretrained(config_path)
-        model = LlamaForCausalLM(config)
-        model = model.eval()
-        layers = find_layers(model)
-        for name in ['lm_head']:
-            if name in layers:
-                del layers[name]
-        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
-    model = accelerate.load_checkpoint_and_dispatch(
-        model=model,
-        checkpoint=model_path,
-        device_map=device_map,
-        no_split_module_classes=["LlamaDecoderLayer"]
-    )
-
-    model.seqlen = seqlen
-
-    if half:
-        model_to_half(model)
-
-    tokenizer = LlamaTokenizer.from_pretrained(config_path)
-    tokenizer.truncation_side = 'left'
-
-    print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
-
-    return model, tokenizer
-
-def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path=None, groupsize=-1, seqlen=2048, max_memory=None, is_v1_model=False):
-    import accelerate
-    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
-
-    if max_memory is None:
-        max_memory = {0: '24Gib', 'cpu': '48Gib'}
-
-    print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
-    t0 = time.time()
-
-    with accelerate.init_empty_weights():
-        config = LlamaConfig.from_pretrained(config_path)
-        model = LlamaForCausalLM(config)
-        model = model.eval()
-        layers = find_layers(model)
-        for name in ['lm_head']:
-            if name in layers:
-                del layers[name]
-        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
-    accelerate.load_checkpoint_in_model(model, checkpoint=model_path, device_map={'': 'cpu'})
-
-    # rotary_emb fix
-    for n, m in model.named_modules():
-        if 'rotary_emb' in n:
-            cos_cached = m.cos_cached.clone().cpu()
-            sin_cached = m.sin_cached.clone().cpu()
-            break
-
-    if lora_path is not None:
-        from peft import PeftModel
-        from peft.tuners.lora import Linear4bitLt
-        model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
-        print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))
-
-    model.seqlen = seqlen
-
-    print('Apply half ...')
-    for n, m in model.named_modules():
-        if isinstance(m, Autograd4bitQuantLinear) or ((lora_path is not None) and isinstance(m, Linear4bitLt)):
-            if m.is_v1_model:
-                m.zeros = m.zeros.half()
-            m.scales = m.scales.half()
-            m.bias = m.bias.half()
-
-    print('Dispatching model ...')
-    device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
-    model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True, main_device=0)
-    torch.cuda.empty_cache()
-    print(Style.BRIGHT + Fore.YELLOW + 'Total {:.2f} Gib VRAM used.'.format(torch.cuda.memory_allocated() / 1024 / 1024))
-
-    # rotary_emb fix
-    for n, m in model.named_modules():
-        if 'rotary_emb' in n:
-            if getattr(m, '_hf_hook', None):
-                if isinstance(m._hf_hook, accelerate.hooks.SequentialHook):
-                    hooks = m._hf_hook.hooks
-                else:
-                    hooks = [m._hf_hook]
-                for hook in hooks:
-                    if hook.offload:
-                        if n + '.sin_cached' not in hook.weights_map.dataset.state_dict.keys():
-                            hook.weights_map.dataset.state_dict[n + '.sin_cached'] = sin_cached.clone().cpu()
-                            hook.weights_map.dataset.state_dict[n + '.cos_cached'] = cos_cached.clone().cpu()
-
-    tokenizer = LlamaTokenizer.from_pretrained(config_path)
-    tokenizer.truncation_side = 'left'
-
-    print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
-
-    return model, tokenizer
-
-load_llama_model_4bit_low_ram_and_offload_to_cpu = load_llama_model_4bit_low_ram_and_offload
+"""
+from https://raw.githubusercontent.com/johnsmith0031/alpaca_lora_4bit/main/autograd_4bit.py
+"""
+import matmul_utils_4bit as mm4b
+import torch
+import torch.nn as nn
+import time
+import math
+from torch.cuda.amp import custom_bwd, custom_fwd
+from colorama import init, Fore, Back, Style
+init(autoreset=True)
+
+
+class AutogradMatmul4bitCuda(torch.autograd.Function):
+    """Autograd function backed by the CUDA 4-bit matmul reconstruction kernels.
+
+    When ``g_idx`` is ``None`` the v1 kernel (``_matmul4bit_v1_recons``) is
+    used, otherwise the v2 kernel (``_matmul4bit_v2_recons``). Only ``x``
+    receives a gradient; every other forward argument gets ``None``.
+    """
+
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, x, qweight, scales, zeros, g_idx, bits, maxq):
+        """Multiply ``x`` by the quantized weight and save tensors for backward.
+
+        ``bits`` and ``maxq`` are not used by this implementation; they keep
+        the signature identical to the Triton implementation in this file.
+        """
+        ctx.save_for_backward(qweight, scales, zeros, g_idx)
+        if g_idx is None:
+            output = mm4b._matmul4bit_v1_recons(x, qweight, scales, zeros)
+        else:
+            output = mm4b._matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx)
+        output = output.clone()
+        return output
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad_output):
+        """Return the gradient for ``x`` followed by ``None`` for the six other inputs."""
+        qweight, scales, zeros, g_idx = ctx.saved_tensors
+        # Start from None so the return statement is valid when x needs no
+        # gradient; previously `grad` was unbound in that case.
+        grad = None
+        if ctx.needs_input_grad[0]:
+            if g_idx is None:
+                grad = mm4b._matmul4bit_v1_recons(grad_output, qweight, scales, zeros, transpose=True)
+            else:
+                grad = mm4b._matmul4bit_v2_recons(grad_output, qweight, scales, zeros, g_idx, transpose=True)
+        return grad, None, None, None, None, None, None
+
+
+try:
+    import triton_utils as tu
+
+    class AutogradMatmul4bitTriton(torch.autograd.Function):
+        """Autograd function backed by the ``triton_utils`` matmul kernels.
+
+        Only defined when ``triton_utils`` can be imported.
+        """
+
+        @staticmethod
+        @custom_fwd(cast_inputs=torch.float16)
+        def forward(ctx, x, qweight, scales, qzeros, g_idx, bits, maxq):
+            """Run ``tu.triton_matmul`` and keep what backward needs on ``ctx``."""
+            result = tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
+            ctx.save_for_backward(qweight, scales, qzeros, g_idx)
+            ctx.bits = bits
+            ctx.maxq = maxq
+            return result.clone()
+
+        @staticmethod
+        @custom_bwd
+        def backward(ctx, grad_output):
+            """Return the gradient for ``x`` (or ``None``) and ``None`` for the rest."""
+            qweight, scales, qzeros, g_idx = ctx.saved_tensors
+            bits = ctx.bits
+            maxq = ctx.maxq
+
+            if not ctx.needs_input_grad[0]:
+                return None, None, None, None, None, None, None
+
+            grad_x = tu.triton_matmul_transpose(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
+            return grad_x, None, None, None, None, None, None
+
+except ImportError:
+    print('Triton not found. Please run "pip install triton".')
+
+
+AutogradMatmul4bit = AutogradMatmul4bitCuda
+backend = 'cuda'
+
+
+def switch_backend_to(to_backend):
+    """Select which implementation the module-level ``AutogradMatmul4bit`` points to.
+
+    Args:
+        to_backend: ``'cuda'`` or ``'triton'``.
+
+    Raises:
+        ValueError: if ``to_backend`` is neither value, or if ``'triton'`` is
+            requested while ``AutogradMatmul4bitTriton`` is not defined in
+            this module's globals.
+    """
+    global AutogradMatmul4bit
+    global backend
+    if to_backend == 'cuda':
+        chosen_impl = AutogradMatmul4bitCuda
+        chosen_name = 'cuda'
+        notice = 'Using CUDA implementation.'
+    elif to_backend == 'triton':
+        # The Triton class only exists if its import succeeded at module load
+        if 'AutogradMatmul4bitTriton' not in globals():
+            raise ValueError('Triton not found. Please install triton_utils.')
+        chosen_impl = AutogradMatmul4bitTriton
+        chosen_name = 'triton'
+        notice = 'Using Triton implementation.'
+    else:
+        raise ValueError('Backend not supported.')
+    AutogradMatmul4bit = chosen_impl
+    backend = chosen_name
+    print(Style.BRIGHT + Fore.GREEN + notice)
+
+
+def matmul4bit_with_backend(x, qweight, scales, qzeros, g_idx, bits, maxq):
+    """Call the matmul of the currently selected backend directly (no autograd wrapper).
+
+    ``bits`` and ``maxq`` are only forwarded on the Triton path, which also
+    asserts that ``qzeros`` is int32.
+
+    Raises:
+        ValueError: If the module-level ``backend`` is neither 'cuda' nor 'triton'.
+    """
+    if backend == 'cuda':
+        return mm4b.matmul4bit(x, qweight, scales, qzeros, g_idx)
+    if backend == 'triton':
+        assert qzeros.dtype == torch.int32
+        return tu.triton_matmul(x, qweight, scales, qzeros, g_idx, bits, maxq)
+    raise ValueError('Backend not supported.')
+
+
+# Assumes layer is perfectly divisible into 256 * 256 blocks
+class Autograd4bitQuantLinear(nn.Module):
+    """Linear-style module backed by quantized buffers instead of a weight parameter.
+
+    ``bits`` is fixed at 4. All tensors are registered as buffers:
+    ``qweight`` (int32), ``scales``, ``bias`` and either ``zeros`` (v1 format)
+    or ``qzeros`` + ``g_idx`` (otherwise). In v1 format ``g_idx`` is a plain
+    ``None`` attribute.
+    """
+
+    def __init__(self, in_features, out_features, groupsize=-1, is_v1_model=False):
+        """Allocate uninitialised buffers sized from the feature counts.
+
+        Args:
+            in_features: Input width of the layer.
+            out_features: Output width of the layer.
+            groupsize: Group size; -1 means one group spanning ``in_features``.
+            is_v1_model: Selects the v1 buffer layout (per-output ``zeros``/``scales``).
+        """
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.bits = 4
+        self.maxq = 2 ** self.bits - 1
+        self.groupsize = in_features if groupsize == -1 else groupsize
+        self.is_v1_model = is_v1_model
+
+        rows_per_256 = self.bits * 8
+        if is_v1_model:
+            self.register_buffer('zeros', torch.empty((out_features, 1)))
+            self.register_buffer('scales', torch.empty((out_features, 1)))
+            self.g_idx = None
+        else:
+            n_groups = math.ceil(in_features/self.groupsize)
+            self.register_buffer(
+                'qzeros',
+                torch.empty((n_groups, out_features // 256 * rows_per_256), dtype=torch.int32),
+            )
+            self.register_buffer('scales', torch.empty((n_groups, out_features)))
+            # Maps every input index to the index of its group.
+            group_of_input = [i // self.groupsize for i in range(in_features)]
+            self.register_buffer('g_idx', torch.tensor(group_of_input, dtype=torch.int32))
+        self.register_buffer('bias', torch.empty(out_features))
+        self.register_buffer(
+            'qweight',
+            torch.empty((in_features // 256 * rows_per_256, out_features), dtype=torch.int32),
+        )
+
+    def forward(self, x):
+        """Apply the quantized matmul and add the bias in place.
+
+        Uses the autograd function when gradients are enabled and the direct
+        backend call otherwise.
+        """
+        zero_points = self.zeros if self.is_v1_model else self.qzeros
+        call_args = (x, self.qweight, self.scales, zero_points, self.g_idx, self.bits, self.maxq)
+        if torch.is_grad_enabled():
+            out = AutogradMatmul4bit.apply(*call_args)
+        else:
+            out = matmul4bit_with_backend(*call_args)
+        out += self.bias
+        return out
+
+
+def make_quant_for_4bit_autograd(module, names, name='', groupsize=-1, is_v1_model=False):
+    """Recursively swap the named submodules for Autograd4bitQuantLinear layers.
+
+    Args:
+        module: Root of the (sub)tree to rewrite in place.
+        names: Container of dotted attribute paths that should be replaced.
+        name: Dotted path of ``module`` itself ('' for the root).
+        groupsize: Forwarded to each new Autograd4bitQuantLinear.
+        is_v1_model: Forwarded to each new Autograd4bitQuantLinear.
+    """
+    if isinstance(module, Autograd4bitQuantLinear):
+        return
+    prefix = name + '.' if name != '' else ''
+    for attr in dir(module):
+        current = getattr(module, attr)
+        if prefix + attr not in names:
+            continue
+        # The replacement takes its dimensions from the layer it replaces.
+        replacement = Autograd4bitQuantLinear(
+            current.in_features, current.out_features, groupsize=groupsize, is_v1_model=is_v1_model
+        )
+        setattr(module, attr, replacement)
+    for child_name, child in module.named_children():
+        make_quant_for_4bit_autograd(child, names, prefix + child_name, groupsize=groupsize, is_v1_model=is_v1_model)
+
+
+def model_to_half(model):
+    """Cast the model to half precision and print a confirmation banner.
+
+    After ``model.half()``, the ``scales`` and ``bias`` buffers (plus ``zeros``
+    on v1-format layers) of every Autograd4bitQuantLinear are explicitly
+    reassigned to their ``.half()`` versions.
+    """
+    model.half()
+    for _, layer in model.named_modules():
+        if not isinstance(layer, Autograd4bitQuantLinear):
+            continue
+        if layer.is_v1_model:
+            layer.zeros = layer.zeros.half()
+        layer.scales = layer.scales.half()
+        layer.bias = layer.bias.half()
+    print(Style.BRIGHT + Fore.YELLOW + 'Converted as Half.')
+
+
+def model_to_float(model):
+    """Cast the model to single precision and print a confirmation banner.
+
+    After ``model.float()``, the ``scales`` and ``bias`` buffers (plus ``zeros``
+    on v1-format layers) of every Autograd4bitQuantLinear are explicitly
+    reassigned to their ``.float()`` versions.
+    """
+    model.float()
+    for _, layer in model.named_modules():
+        if not isinstance(layer, Autograd4bitQuantLinear):
+            continue
+        if layer.is_v1_model:
+            layer.zeros = layer.zeros.float()
+        layer.scales = layer.scales.float()
+        layer.bias = layer.bias.float()
+    print(Style.BRIGHT + Fore.YELLOW + 'Converted as Float.')
+
+
+def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+    """Collect submodules whose exact type is listed in ``layers``.
+
+    Args:
+        module: Root module to search.
+        layers: Types to match; compared with ``type(module)``, so subclasses
+            do not match.
+        name: Dotted path of ``module`` ('' for the root).
+
+    Returns:
+        Dict mapping dotted path to matching module. A matching module is
+        returned as-is and its children are not searched.
+    """
+    if type(module) in layers:
+        return {name: module}
+    prefix = name + '.' if name != '' else ''
+    found = {}
+    for child_name, child in module.named_children():
+        found.update(find_layers(child, layers=layers, name=prefix + child_name))
+    return found
+
+
+def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=False, device_map="auto", seqlen=2048, is_v1_model=False):
+    """Build an empty-weight LLaMA, swap in 4-bit layers, then load and dispatch the checkpoint.
+
+    Args:
+        config_path: Passed to LlamaConfig.from_pretrained and LlamaTokenizer.from_pretrained.
+        model_path: Checkpoint passed to accelerate.load_checkpoint_and_dispatch.
+        groupsize: Forwarded to make_quant_for_4bit_autograd.
+        half: When true, model_to_half is applied after loading.
+        device_map: Forwarded to accelerate.load_checkpoint_and_dispatch.
+        seqlen: Stored on the returned model as ``model.seqlen``.
+        is_v1_model: Forwarded to make_quant_for_4bit_autograd.
+
+    Returns:
+        Tuple ``(model, tokenizer)``; the tokenizer truncates on the left.
+    """
+    import accelerate
+    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+
+    print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
+    t0 = time.time()
+
+    with accelerate.init_empty_weights():
+        model = LlamaForCausalLM(LlamaConfig.from_pretrained(config_path)).eval()
+        quant_targets = find_layers(model)
+        # lm_head is left out of the set of layers to replace.
+        quant_targets.pop('lm_head', None)
+        make_quant_for_4bit_autograd(model, quant_targets, groupsize=groupsize, is_v1_model=is_v1_model)
+
+    model = accelerate.load_checkpoint_and_dispatch(
+        model=model,
+        checkpoint=model_path,
+        device_map=device_map,
+        no_split_module_classes=["LlamaDecoderLayer"],
+    )
+    model.seqlen = seqlen
+    if half:
+        model_to_half(model)
+
+    tokenizer = LlamaTokenizer.from_pretrained(config_path)
+    tokenizer.truncation_side = 'left'
+
+    print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
+    return model, tokenizer
+
+def load_llama_model_4bit_low_ram_and_offload(config_path, model_path, lora_path=None, groupsize=-1, seqlen=2048, max_memory=None, is_v1_model=False):
+    """Load a 4-bit LLaMA checkpoint on CPU, optionally apply a LoRA, then dispatch with offloading.
+
+    Args:
+        config_path: Passed to LlamaConfig.from_pretrained and LlamaTokenizer.from_pretrained.
+        model_path: Checkpoint passed to accelerate.load_checkpoint_in_model.
+        lora_path: Optional adapter location passed to PeftModel.from_pretrained.
+        groupsize: Forwarded to make_quant_for_4bit_autograd.
+        seqlen: Stored on the returned model as ``model.seqlen``.
+        max_memory: Per-device budget for accelerate.infer_auto_device_map;
+            defaults to ``{0: '24Gib', 'cpu': '48Gib'}``.
+        is_v1_model: Forwarded to make_quant_for_4bit_autograd.
+
+    Returns:
+        Tuple ``(model, tokenizer)``; the tokenizer truncates on the left.
+    """
+    import accelerate
+    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+
+    if max_memory is None:
+        max_memory = {0: '24Gib', 'cpu': '48Gib'}
+
+    print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
+    t0 = time.time()
+
+    with accelerate.init_empty_weights():
+        config = LlamaConfig.from_pretrained(config_path)
+        model = LlamaForCausalLM(config)
+        model = model.eval()
+        layers = find_layers(model)
+        # lm_head is left out of the set of layers to replace.
+        layers.pop('lm_head', None)
+        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model)
+    accelerate.load_checkpoint_in_model(model, checkpoint=model_path, device_map={'': 'cpu'})
+
+    # rotary_emb fix: keep CPU copies of the first rotary module's cached
+    # tables so they can be injected into the offload hooks after dispatch.
+    for n, m in model.named_modules():
+        if 'rotary_emb' in n:
+            cos_cached = m.cos_cached.clone().cpu()
+            sin_cached = m.sin_cached.clone().cpu()
+            break
+
+    if lora_path is not None:
+        from peft import PeftModel
+        from peft.tuners.lora import Linear4bitLt
+        model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
+        print(Style.BRIGHT + Fore.GREEN + '{} Lora Applied.'.format(lora_path))
+
+    model.seqlen = seqlen
+
+    print('Apply half ...')
+    for n, m in model.named_modules():
+        # Linear4bitLt is only imported (and only checked) when a LoRA was applied.
+        if isinstance(m, Autograd4bitQuantLinear) or ((lora_path is not None) and isinstance(m, Linear4bitLt)):
+            if m.is_v1_model:
+                m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            m.bias = m.bias.half()
+
+    print('Dispatching model ...')
+    device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
+    model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True, main_device=0)
+    torch.cuda.empty_cache()
+    # memory_allocated() reports bytes; divide by 1024**3 so the value matches
+    # the "Gib" label (the previous two divisions produced MiB).
+    print(Style.BRIGHT + Fore.YELLOW + 'Total {:.2f} Gib VRAM used.'.format(torch.cuda.memory_allocated() / 1024 ** 3))
+
+    # rotary_emb fix: for offloading hooks whose weights map lacks the cached
+    # tables of a rotary module, insert the copies captured above.
+    for n, m in model.named_modules():
+        if 'rotary_emb' in n:
+            if getattr(m, '_hf_hook', None):
+                if isinstance(m._hf_hook, accelerate.hooks.SequentialHook):
+                    hooks = m._hf_hook.hooks
+                else:
+                    hooks = [m._hf_hook]
+                for hook in hooks:
+                    if hook.offload:
+                        if n + '.sin_cached' not in hook.weights_map.dataset.state_dict.keys():
+                            hook.weights_map.dataset.state_dict[n + '.sin_cached'] = sin_cached.clone().cpu()
+                            hook.weights_map.dataset.state_dict[n + '.cos_cached'] = cos_cached.clone().cpu()
+
+    tokenizer = LlamaTokenizer.from_pretrained(config_path)
+    tokenizer.truncation_side = 'left'
+
+    print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")
+
+    return model, tokenizer
+
+# Second name bound to the same loader function.
+load_llama_model_4bit_low_ram_and_offload_to_cpu = load_llama_model_4bit_low_ram_and_offload
@@ -1,167 +0,0 @@
-#https://github.com/fpgaminer/GPTQ-triton
-"""
-Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
-"""
-
-import builtins
-import math
-import time
-from typing import Dict
-
-import triton
-
-
-class Autotuner(triton.KernelInterface):
-	'''
-	Benchmarks a set of configs for a kernel and caches the fastest one per key.
-	'''
-
-	def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False):
-		'''
-		:param fn: the kernel object; its run() and warmup() methods are called
-		:param arg_names: positional argument names of the kernel
-		:param configs: configs to choose from; a single default config is used when empty
-		:param key: argument names whose values form the tuning-cache key
-		:param reset_to_zero: argument names whose tensors are zeroed before each launch, or None
-		:param prune_configs_by: a dict of functions that are used to prune configs, fields:
-			'perf_model': performance model used to predicate running time with different configs, returns running time
-			'top_k': number of configs to bench
-			'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It is called with the configs and the named arguments, and returns pruned configs.
-		:param nearest_power_of_two: whether to round key arguments to the nearest power of two when caching tuning results
-		'''
-		if not configs:
-			self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
-		else:
-			self.configs = configs
-		self.key_idx = [arg_names.index(k) for k in key]
-		self.nearest_power_of_two = nearest_power_of_two
-		self.cache = {}
-		# hook to reset all required tensor to zeros before relaunching a kernel
-		self.hook = lambda args: 0
-		if reset_to_zero is not None:
-			self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
-
-			def _hook(args):
-				for i in self.reset_idx:
-					args[i].zero_()
-			self.hook = _hook
-		self.arg_names = arg_names
-		# prune configs
-		if prune_configs_by:
-			perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k']
-			# 'early_config_prune' is optional; fall back to None instead of
-			# leaving the local unbound (which raised UnboundLocalError below).
-			early_config_prune = prune_configs_by.get('early_config_prune')
-		else:
-			perf_model, top_k, early_config_prune = None, None, None
-		self.perf_model, self.configs_top_k = perf_model, top_k
-		self.early_config_prune = early_config_prune
-		self.fn = fn
-
-	def _bench(self, *args, config, **meta):
-		'''
-		Time one config and return the measurement from triton.testing.do_bench.
-
-		Returns float('inf') when the launch raises triton.compiler.OutOfResources.
-		Raises ValueError when a meta-parameter is given both as a kwarg and by the config.
-		'''
-		# check for conflicts, i.e. meta-parameters both provided
-		# as kwargs and by the autotuner
-		conflicts = meta.keys() & config.kwargs.keys()
-		if conflicts:
-			raise ValueError(
-				f"Conflicting meta-parameters: {', '.join(conflicts)}."
-				" Make sure that you don't re-define auto-tuned symbols."
-			)
-		# augment meta-parameters with tunable ones
-		current = dict(meta, **config.kwargs)
-
-		def kernel_call():
-			if config.pre_hook:
-				config.pre_hook(self.nargs)
-			self.hook(args)
-			self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
-		try:
-			# In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
-			# PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
-			return triton.testing.do_bench(kernel_call, rep=40)
-		except triton.compiler.OutOfResources:
-			return float('inf')
-
-	def run(self, *args, **kwargs):
-		'''
-		Launch the kernel with the best known config for the current key.
-
-		With more than one config, the key is built from the key arguments and
-		all pruned configs are benchmarked the first time a key is seen; the
-		fastest is cached. With a single config no benchmarking happens.
-		'''
-		self.nargs = dict(zip(self.arg_names, args))
-		if len(self.configs) > 1:
-			key = tuple(args[i] for i in self.key_idx)
-
-			# This reduces the amount of autotuning by rounding the keys to the nearest power of two
-			# In my testing this gives decent results, and greatly reduces the amount of tuning required
-			if self.nearest_power_of_two:
-				key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])
-
-			if key not in self.cache:
-				# prune configs
-				pruned_configs = self.prune_configs(kwargs)
-				bench_start = time.time()
-				timings = {config: self._bench(*args, config=config, **kwargs)
-							for config in pruned_configs}
-				bench_end = time.time()
-				self.bench_time = bench_end - bench_start
-				self.cache[key] = builtins.min(timings, key=timings.get)
-				# reset the tensors once more after benchmarking, before the real launch
-				self.hook(args)
-				self.configs_timings = timings
-			config = self.cache[key]
-		else:
-			config = self.configs[0]
-		self.best_config = config
-		if config.pre_hook is not None:
-			config.pre_hook(self.nargs)
-		return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
-
-	def prune_configs(self, kwargs):
-		'''
-		Return the configs to benchmark: apply early_config_prune when set,
-		then keep the top_k configs ranked by perf_model when one is set.
-		'''
-		pruned_configs = self.configs
-		if self.early_config_prune:
-			pruned_configs = self.early_config_prune(self.configs, self.nargs)
-		if self.perf_model:
-			top_k = self.configs_top_k
-			# a float top_k of at most 1.0 is treated as a fraction of all configs
-			if isinstance(top_k, float) and top_k <= 1.0:
-				top_k = int(len(self.configs) * top_k)
-			if len(pruned_configs) > top_k:
-				est_timing = {
-					config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages,
-											num_warps=config.num_warps)
-					for config in pruned_configs
-				}
-				pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
-		return pruned_configs
-
-	def warmup(self, *args, **kwargs):
-		'''
-		Call fn.warmup once for every pruned config, then clear self.nargs.
-		'''
-		self.nargs = dict(zip(self.arg_names, args))
-		for config in self.prune_configs(kwargs):
-			self.fn.warmup(
-				*args,
-				num_warps=config.num_warps,
-				num_stages=config.num_stages,
-				**kwargs,
-				**config.kwargs,
-			)
-		self.nargs = None
-
-def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
-	"""
-	Build a decorator that wraps a :code:`triton.jit`'d function in an :code:`Autotuner`.
-	.. highlight:: python
-	.. code-block:: python
-		@triton.autotune(configs=[
-			triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
-			triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
-			],
-			key=['x_size'] # both configs are evaluated whenever
-							# the value of x_size changes
-		)
-		@triton.jit
-		def kernel(x_ptr, x_size, **META):
-			BLOCK_SIZE = META['BLOCK_SIZE']
-	:note: Evaluating all configurations runs the kernel several times, so any
-			value the kernel updates is updated several times as well. Use
-			`reset_to_zero` to have the listed tensors reset to zero before
-			each configuration is run.
-	:param configs: a list of :code:`triton.Config` objects
-	:type configs: list[triton.Config]
-	:param key: a list of argument names; a change in their values triggers evaluation of all provided configs.
-	:type key: list[str]
-	:param prune_configs_by: a dict of functions used to prune configs, fields:
-		'perf_model': performance model that predicts running time for a config, returns running time
-		'top_k': number of configs to bench
-		'early_config_prune'(optional): a function used for early pruning (eg, num_stages). It takes configs:List[Config] as its input, and returns pruned configs.
-	:param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
-	:type reset_to_zero: list[str]
-	:param nearest_power_of_two: forwarded unchanged to :code:`Autotuner`.
-	:type nearest_power_of_two: bool
-	"""
-	def decorator(fn):
-		return Autotuner(
-			fn,
-			fn.arg_names,
-			configs,
-			key,
-			reset_to_zero,
-			prune_configs_by=prune_configs_by,
-			nearest_power_of_two=nearest_power_of_two,
-		)
-
-	return decorator
@@ -1,106 +0,0 @@
-The alpaca (Lama pacos) is a species of South American camelid mammal. It is similar to, and often confused with, the llama. However, alpacas are often noticeably smaller than llamas. The two animals are closely related and can successfully crossbreed. Both species are believed to have been domesticated from their wild relatives, the vicuña and guanaco. There are two breeds of alpaca: the Suri alpaca and the Huacaya alpaca.
-
-Alpacas are kept in herds that graze on the level heights of the Andes of Southern Peru, Western Bolivia, Ecuador, and Northern Chile at an altitude of 3,500 to 5,000 metres (11,000 to 16,000 feet) above sea level.[1] Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber.
-
-Alpaca fiber is used for making knitted and woven items, similar to sheep's wool. These items include blankets, sweaters, hats, gloves, scarves, a wide variety of textiles, and ponchos, in South America, as well as sweaters, socks, coats, and bedding in other parts of the world. The fiber comes in more than 52 natural colors as classified in Peru, 12 as classified in Australia, and 16 as classified in the United States.
-
-Alpacas communicate through body language. The most common is spitting to show dominance[2] when they are in distress, fearful, or feel agitated. Male alpacas are more aggressive than females, and tend to establish dominance within their herd group. In some cases, alpha males will immobilize the head and neck of a weaker or challenging male in order to show their strength and dominance.
-
-In the textile industry, "alpaca" primarily refers to the hair of Peruvian alpacas, but more broadly it refers to a style of fabric originally made from alpaca hair, such as mohair, Icelandic sheep wool, or even high-quality wool from other breeds of sheep. In trade, distinctions are made between alpacas and the several styles of mohair and luster.[3]
-
-An adult alpaca generally is between 81 and 99 centimetres (32 and 39 inches) in height at the shoulders (withers). They usually weigh between 48 and 90 kilograms (106 and 198 pounds).[4] Raised in the same conditions, the difference in weight can be small with males weighing around 22.3 kilograms (49 lb 3 oz) and females 21.3 kilograms (46 lb 15 oz).[5]
-
-The relationship between alpacas and vicuñas was disputed for many years. In the 18th and 19th centuries, the four South American lamoid species were assigned scientific names. At that time, the alpaca was assumed to be descended from the llama, ignoring similarities in size, fleece and dentition between the alpaca and the vicuña. Classification was complicated by the fact that all four species of South American camelid can interbreed and produce fertile offspring.[6] The advent of DNA technology made a more accurate classification possible.
-
-In 2001, the alpaca genus classification changed from Lama pacos to Vicugna pacos, following the presentation of a paper[7] on work by Miranda Kadwell et al. on alpaca DNA to the Royal Society showing the alpaca is descended from the vicuña, not the guanaco.
-
-Alpacas were domesticated thousands of years ago. The Moche people of Northern Peru often used alpaca images in their art.[8] There are no known wild alpacas, and its closest living relative, the vicuña (also native to South America), is the wild ancestor of the alpaca.
-
-The family Camelidae first appeared in Americas 40–45 million years ago, during the Eocene period, from the common ancestor, Protylopus. The descendants divided into Camelini and Lamini tribes, taking different migratory patterns to Asia and South America, respectively. Although the camelids became extinct in North America around 3 million years ago, it flourished in the South with the species we see today.[9] It was not until 2–5 million years ago, during the Pliocene, that the genus Hemiauchenia of the tribe Lamini split into Palaeolama and Lama; the latter would then split again into Lama and Vicugna upon migrating down to South America.
-
-Remains of vicuña and guanaco have been found throughout Peru for around 12,000 years. Their domesticated counterparts, the llama and alpacas, have been found mummified in the Moquegua valley, in the south of Peru, dating back 900 to 1000 years. Mummies found in this region show two breeds of alpacas. More precise analysis of bone and teeth of these mummies has demonstrated that alpacas were domesticated from the Vicugna vicugna. Other research, considering the behavioral and morphological characteristics of alpacas and their wild counterparts, seems to indicate that alpacas could find their origins in Lama guanicoe as well as Vicugna vicugna, or even a hybrid of both.
-
-Genetic analysis shows a different picture of the origins of the alpaca. Analysis of mitochondrial DNA shows that most alpacas have guanaco mtDNA, and many also have vicuña mtDNA. But microsatellite data shows that alpaca DNA is much more similar to vicuña DNA than to guanaco DNA. This suggests that alpacas are descendants of the Vicugna vicugna, not of the Lama guanicoe. The discrepancy with mtDNA seems to be a result of the fact that mtDNA is only transmitted by the mother, and recent husbandry practices have caused hybridization between llamas (which primarily carry guanaco DNA) and alpacas. To the extent that many of today's domestic alpacas are the result of male alpacas bred to female llamas, this would explain the mtDNA consistent with guanacos. This situation has led to attempts to reclassify the alpaca as Vicugna pacos.[7]
-
-The alpaca comes in two breeds, Suri and Huacaya, based on their fibers rather than scientific or European classifications.
-
-Huacaya alpacas are the most commonly found, constituting about 90% of the population.[10] The Huacaya alpaca is thought to have originated in post-colonial Peru. This is due to their thicker fleece which makes them more suited to survive in the higher altitudes of the Andes after being pushed into the highlands of Peru with the arrival of the Spanish.[11][better source needed]
-
-Suri alpacas represent a smaller portion of the total alpaca population, around 10%.[10] They are thought to have been more prevalent in pre-Columbian Peru since they could be kept at a lower altitude where a thicker fleece was not needed for harsh weather conditions.[11][better source needed]
-
-Alpacas are social herd animals that live in family groups, consisting of a territorial alpha male, females, and their young ones. Alpacas warn the herd about intruders by making sharp, noisy inhalations that sound like a high-pitched bray. The herd may attack smaller predators with their front feet and can spit and kick. Their aggression towards members of the canid family (coyotes, foxes, dogs etc.) is exploited when alpacas are used as guard llamas for guarding sheep.[12][13]
-
-Alpacas can sometimes be aggressive, but they can also be very gentle, intelligent, and extremely observant. For the most part, alpacas are very quiet, but male alpacas are more energetic when they get involved in fighting with other alpacas.[14] When they prey, they are cautious but also nervous when they feel any type of threat. They can feel threatened when a person or another alpaca comes up from behind them.[15][better source needed]
-
-Alpacas set their own boundaries of "personal space" within their families and groups.[16] They make a hierarchy in some sense, and each alpaca is aware of the dominant animals in each group.[14] Body language is the key to their communication. It helps to maintain their order. One example of their body communication includes a pose named broadside, where their ears are pulled back and they stand sideways. This pose is used when male alpacas are defending their territory.[2]
-
-When they are young, they tend to follow larger objects and to sit near or under them. An example of this is a baby alpaca with its mother. This can also apply when an alpaca passes by an older alpaca.[16]
-
-Training
-Alpacas are generally very trainable and usually respond to reward, most commonly in the form of food. They can usually be petted without getting agitated, especially if one avoids petting the head or neck. Alpacas are usually quite easy to herd, even in large groups. However, during herding, it is recommended for the handler to approach the animals slowly and quietly, as failing to do so can result in danger for both the animals and the handler.[17]
-
-Alpacas and llamas have started showing up in U.S. nursing homes and hospitals as trained, certified therapy animals. The Mayo Clinic says animal-assisted therapy can reduce pain, depression, anxiety, and fatigue. This type of animal therapy is growing in popularity, and there are several organizations throughout the United States that participate.[18]
-
-Spitting
-Not all alpacas spit, but all are capable of doing so. "Spit" is somewhat euphemistic; occasionally the projectile contains only air and a little saliva, although alpacas commonly bring up acidic stomach contents (generally a green, grassy mix) and project it onto their chosen targets. Spitting is mostly reserved for other alpacas, but an alpaca will also occasionally spit at a human.
-
-Spitting can result in what is called "sour mouth". Sour mouth is characterized by "a loose-hanging lower lip and a gaping mouth."[19]
-
-Alpacas can spit for several reasons. A female alpaca spits when she is not interested in a male alpaca, typically when she thinks that she is already impregnated. Both sexes of alpaca keep others away from their food, or anything they have their eyes on. Most give a slight warning before spitting by blowing air out and raising their heads, giving their ears a "pinned" appearance.[16]
-
-Alpacas can spit up to ten feet if they need to. For example, if another animal does not back off, the alpaca will throw up its stomach contents, resulting in a lot of spit.[20]
-
-Some signs of stress which can lead to their spitting habits include: humming, a wrinkle under their eye, drooling, rapid breathing, and stomping their feet. When alpacas show any sign of interest or alertness, they tend to sniff their surroundings, watch closely, or stand quietly in place and stare.[20]
-
-When it comes to reproduction, they spit because it is a response triggered by the progesterone levels being increased, which is associated with ovulation.[21]
-
-Hygiene
-Alpacas use a communal dung pile,[22] where they do not graze. This behaviour tends to limit the spread of internal parasites. Generally, males have much tidier, and fewer dung piles than females, which tend to stand in a line and all go at once. One female approaches the dung pile and begins to urinate and/or defecate, and the rest of the herd often follows. Alpaca waste is collected and used as garden fertilizer or even natural fertilizer.[2]
-
-Because of their preference for using a dung pile for excreting bodily waste, some alpacas have been successfully house-trained.[23]
-
-Alpacas develop dental hygiene problems which affect their eating and behavior. Warning signs include protracted chewing while eating, or food spilling out of their mouths. Poor body condition and sunken cheeks are also telltales of dental problems.
-
-Alpacas make a variety of sounds:
-
-Humming: When alpacas are born, the mother and baby hum constantly. They also hum as a sign of distress, especially when they are separated from their herd. Alpacas may also hum when curious, happy, worried or cautious.
-Snorting: Alpacas snort when another alpaca is invading its space.
-Grumbling: Alpacas grumble to warn each other. For example, when one is invading another's personal space, it sounds like gurgling.
-Clucking: Similar to a hen's cluck, alpacas cluck when a mother is concerned for her cria. Male alpacas cluck to signal friendly behavior.[2]
-Screaming: Their screams are extremely deafening and loud. They will scream when they are not handled correctly or when they are being attacked by a potential enemy.
-Screeching: A bird-like cry, presumably intended to terrify the opponent. This sound is typically used by male alpacas when they are in a fight over dominance. When a female screeches, it is more of a growl when she is angry.
-
-Females are induced ovulators;[24] meaning the act of mating and the presence of semen causes them to ovulate. Females usually conceive after just one breeding, but occasionally do have trouble conceiving. Artificial insemination is technically difficult, expensive and not common, but it can be accomplished. Embryo transfer is more widespread.
-
-A male is usually ready to mate for the first time between two and three years of age. It is not advisable to allow a young female to be bred until she is mature and has reached two-thirds of her mature weight. Over-breeding a young female before conception is possibly a common cause of uterine infections. As the age of maturation varies greatly between individuals, it is usually recommended that novice breeders wait until females are 18 months of age or older before initiating breeding.[25]
-
-Alpacas can breed at any time throughout the year but it is more difficult to breed in the winter. Most breed during autumn or late spring. The most popular way to have alpacas mate is pen mating. Pen mating is when they move both the female and the desired male into a pen. Another way is paddock mating where one male alpaca is let loose in the paddock with several female alpacas.
-
-The gestation period is, on average, 11.5 months, and usually results in a single offspring, or cria. Twins are rare, occurring about once per 1000 deliveries.[26] Cria are generally between 15 and 19 pounds, and are standing 30 to 90 minutes after birth.[27] After a female gives birth, she is generally receptive to breeding again after about two weeks. Crias may be weaned through human intervention at about six months old and 60 pounds, but many breeders prefer to allow the female to decide when to wean her offspring; they can be weaned earlier or later depending on their size and emotional maturity.
-
-The average lifespan of an alpaca is between 15–20 years, and the longest-lived alpaca on record is 27 years.[28]
-
-Cattle tuberculosis can also infect alpacas: Mycobacterium bovis also causes TB in this species worldwide.[29] Krajewska‐Wędzina et al., 2020 detect M. bovis in individuals traded from the United Kingdom to Poland.[29] To accomplish this they develop a seroassay which correctly identifies positive subjects which are false negative for a common skin test.[29] Krajewska‐Wędzina et al. also find that alpacas are unusual in mounting a competent early-infection immune response.[29] Bernitz et al., 2021 believe this to generalise to all camelids.[29]
-
-Alpacas can be found throughout most of South America.[30] They typically live in temperate conditions in the mountains with high altitudes.
-
-They are easy to care for since they are not limited to a specific type of environment. Animals such as flamingos, condors, spectacled bears, mountain lions, coyotes, llamas, and sheep live near alpacas when they are in their natural habitat.
-
-Alpacas are native to Peru, but can be found throughout the globe in captivity.[30] Peru currently has the largest alpaca population, with over half the world's animals.[31] The population declined drastically after the Spanish Conquistadors invaded the Andes mountains in 1532, after which 98% of the animals were destroyed. The Spanish also brought with them diseases that were fatal to alpacas.[32]
-
-European conquest forced the animals to move higher into the mountains,[how?] where they remained permanently. Although alpacas had almost been wiped out completely, they were rediscovered sometime during the 19th century by Europeans. After Europeans found uses for them, the animals became important to societies during the industrial revolution.[33]
-
-Nuzzle and Scratch was a British children's television programme featuring two fictional alpacas that was first broadcast between 2008 and 2011.[34]
-
-Interest in alpacas grew as a result of Depp v. Heard, the 2022 trial in which Johnny Depp sued Amber Heard for defamation in Virginia after Heard wrote an op-ed saying she was a public victim of domestic violence. Depp testified, under oath, that he would not make another Pirates of the Caribbean film for "300 million dollars and a million alpacas".[35][36][37]
-
-Alpacas chew their food which ends up being mixed with their cud and saliva and then they swallow it. Alpacas usually eat 1.5% of their body weight daily for normal growth.[38] They mainly need pasture grass, hay, or silage but some may also need supplemental energy and protein foods and they will also normally try to chew on almost anything (e.g. empty bottle). Most alpaca ranchers rotate their feeding grounds so the grass can regrow and fecal parasites may die before reusing the area. Pasture grass is a great source of protein. When seasons change, the grass loses or gains more protein. For example, in the spring, the pasture grass has about 20% protein while in the summer, it only has 6%.[38] They need more energy supplements in the winter to produce body heat and warmth. They get their fiber from hay or from long stems which provides them with vitamin E. Green grass contains vitamin A and E.
-
-Alpacas can eat natural unfertilized grass; however, ranchers can also supplement grass with low-protein grass hay. To provide selenium and other necessary vitamins, ranchers will feed their domestic alpacas a daily dose of grain to provide additional nutrients that are not fully obtained from their primary diet.[39] Alpacas may obtain the necessary vitamins in their native grazing ranges.
-
-Alpacas, like other camelids, have a three-chambered stomach; combined with chewing cud, this three-chambered system allows maximum extraction of nutrients from low-quality forages. Alpacas are not ruminants, pseudo-ruminants, or modified ruminants, as there are many differences between the anatomy and physiology of a camelid and a ruminant stomach.[40]
-
-Alpacas will chew their food in a figure eight motion, swallow the food, and then pass it into one of the stomach's chambers. The first and second chambers (called C1 and C2) are anaerobic fermentation chambers where the fermentation process begins. The alpaca will further absorb nutrients and water in the first part of the third chamber. The end of the third chamber (called C3) is where the stomach secretes acids to digest food and is the likely place where an alpaca will have ulcers if stressed.
-
-Many plants are poisonous to the alpaca, including the bracken fern, Madagascar ragwort, oleander, and some azaleas. In common with similar livestock, others include acorns, African rue, agave, amaryllis, autumn crocus, bear grass, broom snakeweed, buckwheat, ragweed, buttercups, calla lily, orange tree foliage, carnations, castor beans, and many others.[41]
-
@@ -1,178 +0,0 @@
-"""
-    llama-4b trainer with support of Stanford Alpaca-like JSON datasets (short for SAD)
-    Intended to use with https://github.com/johnsmith0031/alpaca_lora_4bit
-
-    SAD structure:
-    [
-        {
-            "instruction": "Give null hypothesis",
-            "input": "6 subjects were given a drug (treatment group) and an additional 6 subjects a placebo (control group).",
-            "output": "Drug is equivalent of placebo"
-        },
-        {
-            "instruction": "What does RNA stand for?",
-            "input": "",
-            "output": "RNA stands for ribonucleic acid."
-        }
-    ]
-"""
-# Early load config to replace attn if needed
-from arg_parser import get_config
-ft_config = get_config()
-
-if ft_config.flash_attention:
-    from monkeypatch.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
-    replace_llama_attn_with_flash_attn()
-
-import autograd_4bit
-if ft_config.backend.lower() == 'triton':
-    autograd_4bit.switch_backend_to('triton')
-else:
-    autograd_4bit.switch_backend_to('cuda')
-
-import sys
-
-import peft
-import peft.tuners.lora
-assert peft.tuners.lora.is_gptq_available()
-
-import torch
-import transformers
-from autograd_4bit import load_llama_model_4bit_low_ram
-from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel
-
-# ! Config
-import train_data
-
-# * Show loaded parameters
-if ft_config.local_rank == 0:
-    print(f"{ft_config}\n")
-
-if ft_config.gradient_checkpointing:
-    print('Disable Dropout.')
-
-# Load Basic Model
-model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir,
-                                                  ft_config.llama_q4_model,
-                                                  device_map=ft_config.device_map,
-                                                  groupsize=ft_config.groupsize,
-                                                  is_v1_model=ft_config.v1)
-
-# Config Lora
-lora_config = LoraConfig(
-    r=ft_config.lora_r,
-    lora_alpha=ft_config.lora_alpha,
-    target_modules=["q_proj", "v_proj"],
-    lora_dropout=ft_config.lora_dropout,
-    bias="none",
-    task_type="CAUSAL_LM",
-)
-if ft_config.lora_apply_dir is None:
-    model = get_peft_model(model, lora_config)
-else:
-    device_map = ft_config.device_map
-    if ft_config.ddp:
-        device_map = {'': 0}
-    else:
-        if torch.cuda.device_count() > 1:
-            device_map = "auto"
-        else:
-            device_map = {'': 0}
-    print('Device map for lora:', device_map)
-    model = PeftModel.from_pretrained(model, ft_config.lora_apply_dir, device_map=device_map, torch_dtype=torch.float32)
-    print(ft_config.lora_apply_dir, 'loaded')
-
-
-# Scales to half
-print('Fitting 4bit scales and zeros to half')
-for n, m in model.named_modules():
-    if '4bit' in str(type(m)):
-        if m.groupsize == -1:
-            m.zeros = m.zeros.half()
-        m.scales = m.scales.half()
-
-# Set tokenizer
-tokenizer.pad_token_id = 0
-
-if not ft_config.skip:
-    # Load Data
-    data = None
-    if ft_config.ds_type == "txt" and not ft_config.skip:
-        #### LLaMa
-        data = train_data.TrainTxt(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
-    elif ft_config.ds_type == "alpaca" and not ft_config.skip:
-        #### Stanford Alpaca-like Data
-        data = train_data.TrainSAD(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
-    elif ft_config.ds_type == "gpt4all" and not ft_config.skip:
-        #### GPT4All Data
-        data = train_data.TrainGPT4All(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
-    else:
-        raise NotImplementedError("ERROR: Unknown dataset format")
-    data.prepare_data(thd=ft_config.txt_row_thd, use_eos_token=ft_config.use_eos_token)
-    ####
-
-    # Use gradient checkpointing
-    if ft_config.gradient_checkpointing:
-        print('Applying gradient checkpointing ...')
-        from gradient_checkpointing import apply_gradient_checkpointing
-        apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
-
-    # Disable Trainer's DataParallel for multigpu
-    if not ft_config.ddp and torch.cuda.device_count() > 1:
-        model.is_parallelizable = True
-        model.model_parallel = True
-
-    training_arguments = transformers.TrainingArguments(
-        per_device_train_batch_size=ft_config.mbatch_size,
-        gradient_accumulation_steps=ft_config.gradient_accumulation_steps,
-        warmup_steps=ft_config.warmup_steps,
-        optim="adamw_torch",
-        num_train_epochs=ft_config.epochs,
-        learning_rate=ft_config.lr,
-        fp16=True,
-        logging_steps=ft_config.logging_steps,
-        evaluation_strategy="no",
-        save_strategy="steps",
-        eval_steps=None,
-        save_steps=ft_config.save_steps,
-        output_dir=ft_config.lora_out_dir,
-        save_total_limit=ft_config.save_total_limit,
-        load_best_model_at_end=False,
-        ddp_find_unused_parameters=False if ft_config.ddp else None,
-    )
-
-    trainer = transformers.Trainer(
-        model=model,
-        train_dataset=data.train_data,
-        eval_dataset=data.val_data,
-        args=training_arguments,
-        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
-    )
-    model.config.use_cache = False
-
-    # Set Model dict
-    old_state_dict = model.state_dict
-    model.state_dict = (
-        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
-    ).__get__(model, type(model))
-
-    # Set Verbose
-    if ft_config.verbose:
-        transformers.logging.set_verbosity_info()
-
-    # Run Trainer
-    if ft_config.resume_checkpoint:
-        print('Resuming from {} ...'.format(ft_config.resume_checkpoint))
-        trainer.train(ft_config.resume_checkpoint)
-    else:
-        trainer.train()
-
-    print('Train completed.')
-
-# Save Model
-model.save_pretrained(ft_config.lora_out_dir)
-
-if ft_config.checkpoint:
-    print("Warning: Merge model + LoRA and save the whole checkpoint not implemented yet.")
-
-print('Model Saved.')
@@ -1,61 +0,0 @@
-from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-from torch.utils.checkpoint import checkpoint
-from torch.autograd import Variable
-import torch
-from torch import nn
-import numpy as np
-
-
-class NewForward:
-
-    def __init__(self, layer):
-        self.layer = layer
-        self.apply_patch()
-
-    def apply_patch(self):
-        self.layer.old_forward_for_cp = self.layer.forward
-        self.layer.forward = self.new_forward
-
-    def new_forward(self, *args, **kwargs):
-        def func(*args):
-            return self.layer.old_forward_for_cp(*args, **kwargs)
-        output = checkpoint(func, *args)
-        return output
-
-
-class VarWrapper:
-
-    def __init__(self, model):
-        self.model = model
-        self.apply_patch()
-        print('Var Wrapper Patch Applied')
-
-    def apply_patch(self):
-        self.model.old_forward_for_cp = self.model.forward
-        self.model.forward = self.new_forward
-
-    def new_forward(self, *args, **kwargs):
-        out = self.model.old_forward_for_cp(*args, **kwargs)
-        out = Variable(out.data, requires_grad=True)
-        return out
-
-
-def apply_gradient_checkpointing(model, checkpoint_ratio=1):
-    new_forwards = []
-    modules = []
-    for n, m in model.named_modules():
-        if isinstance(m, LlamaDecoderLayer):
-            modules.append(m)
-    if checkpoint_ratio < 1 and checkpoint_ratio > 0:
-        checkpoint_locs = np.array((np.linspace(0, 1, int(len(modules) * checkpoint_ratio)) * (len(modules)-1)).round(), dtype=int)
-    else:
-        checkpoint_locs = np.arange(len(modules))
-    for i in checkpoint_locs:
-        m = modules[i]
-        new_forwards.append(NewForward(m))
-        print('Forward Patch Applied For Block {}'.format(i))
-    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Embedding):
-            wrapper = VarWrapper(m)
-            break
-    return new_forwards, wrapper
@@ -1,44 +0,0 @@
-import os
-import sys
-import time
-import torch
-from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
-config_path = './llama-13b-4bit/'
-model_path = './llama-13b-4bit.pt'
-model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1)
-
-print('Fitting 4bit scales and zeros to half')
-model.half()
-for n, m in model.named_modules():
-    if isinstance(m, Autograd4bitQuantLinear):
-        if m.groupsize == -1:
-            m.zeros = m.zeros.half()
-        m.scales = m.scales.half()
-        m.bias = m.bias.half()
-
-print('Apply AMP Wrapper ...')
-from amp_wrapper import AMPWrapper
-wrapper = AMPWrapper(model)
-wrapper.apply_generate()
-
-prompt = '''I think the meaning of life is'''
-batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
-batch = {k: v.cuda() for k, v in batch.items()}
-
-start = time.time()
-with torch.no_grad():
-    generated = model.generate(inputs=batch["input_ids"],
-                               do_sample=True, use_cache=True,
-                               repetition_penalty=1.1,
-                               max_new_tokens=20,
-                               temperature=0.9,
-                               top_p=0.95,
-                               top_k=40,
-                               return_dict_in_generate=True,
-                               output_attentions=False,
-                               output_hidden_states=False,
-                               output_scores=False)
-result_text = tokenizer.decode(generated['sequences'].cpu().tolist()[0])
-end = time.time()
-print(result_text)
-print(end - start)
@@ -1,143 +0,0 @@
-import torch
-import numpy as np
-from gptq_llama import quant_cuda
-
-
-# Global Buffer
-buffer_mat_dic = {}
-use_new = True
-auto_switch = True
-auto_switch_thd = 8
-debug = False
-
-
-def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
-    if shape_of_qweight not in buffer_mat_dic.keys():
-        buffer_mat_dic[shape_of_qweight] = torch.zeros((shape_of_qweight[0] * 8, shape_of_qweight[1]), dtype=dtype, device=device)
-    else:
-        if buffer_mat_dic[shape_of_qweight].device != device:
-            buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(device)
-        if buffer_mat_dic[shape_of_qweight].dtype != dtype:
-            buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(dtype=dtype)
-    return buffer_mat_dic[shape_of_qweight]
-
-
-def _matmul4bit_v1(x, qweight, scales, zeros):
-    """
-    input x: (n, m)
-    qweight: (j, k)
-    where m == j*8
-
-    perform x @ qweight
-
-    return y:
-    """
-    if debug:
-        print('_matmul4bit_v1')
-    assert qweight.shape[0] * 8 == x.shape[-1]
-    outshape = x.shape[:-1] + (qweight.shape[1],)
-    x = x.reshape(-1, x.shape[-1])
-    y = torch.zeros((x.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x.device)
-    dtype = x.dtype
-    x = x.half()
-    quant_cuda.vecquant4matmul_v1_faster(x, qweight, y, scales, zeros)
-    y = y.to(dtype)
-    return y.reshape(outshape)
-
-
-def _matmul4bit_v2(x, qweight, scales, zeros, g_idx):
-    """
-    input x: (n, m)
-    qweight: (j, k)
-    where m == j*8
-
-    perform x @ qweight
-
-    return y:
-    """
-    if debug:
-        print('_matmul4bit_v2')
-    assert qweight.shape[0] * 8 == x.shape[-1]
-    outshape = x.shape[:-1] + (qweight.shape[1],)
-    x = x.reshape(-1, x.shape[-1])
-    y = torch.zeros((x.shape[0], qweight.shape[-1]), dtype=torch.float32, device=x.device)
-    dtype = x.dtype
-    x = x.half()
-    quant_cuda.vecquant4matmul_faster(x, qweight, y, scales, zeros, g_idx, x.shape[-1] // 2)
-    y = y.to(dtype)
-    return y.reshape(outshape)
-
-
-def _matmul4bit_v1_recons(x, qweight, scales, zeros, transpose=False):
-    if debug:
-        print('_matmul4bit_v1_recons')
-    if not transpose:
-        assert qweight.shape[0] * 8 == x.shape[-1]
-    else:
-        assert qweight.shape[1] == x.shape[-1]
-    buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
-    quant_cuda.vecquant4recons_v1(qweight, buffer, scales, zeros)
-    if not transpose:
-        output = torch.matmul(x, buffer)
-    else:
-        output = torch.matmul(x, buffer.T)
-    return output
-
-
-def _matmul4bit_v2_recons(x, qweight, scales, zeros, g_idx, transpose=False):
-    if debug:
-        print('_matmul4bit_v2_recons')
-    if not transpose:
-        assert qweight.shape[0] * 8 == x.shape[-1]
-    else:
-        assert qweight.shape[1] == x.shape[-1]
-    buffer = get_buffer(qweight.shape, dtype=scales.dtype, device=qweight.device)
-    quant_cuda.vecquant4recons_v2(qweight, buffer, scales, zeros, g_idx)
-    if not transpose:
-        output = torch.matmul(x, buffer)
-    else:
-        output = torch.matmul(x, buffer.T)
-    return output
-
-
-def matmul4bit(x, qweight, scales, zeros, g_idx=None):
-    # detect if zeros is int32
-    if zeros.dtype != torch.int32:
-        # use v1
-        if use_new:
-            if auto_switch:
-                if np.prod(x.shape[:-1]) > auto_switch_thd:
-                    output = _matmul4bit_v1_recons(x.to(scales.dtype), qweight, scales, zeros)
-                else:
-                    output = _matmul4bit_v1(x, qweight, scales.float(), zeros.float())
-        else:
-            output = _matmul4bit_v1(x, qweight, scales.float(), zeros.float())
-    else:
-        if g_idx is None:
-            g_idx = torch.zeros(qweight.shape[0] * 8, dtype=torch.int32, device=x.device)
-        # use v2
-        if use_new:
-            if auto_switch:
-                if np.prod(x.shape[:-1]) > auto_switch_thd:
-                    output = _matmul4bit_v2_recons(x.to(scales.dtype), qweight, scales, zeros, g_idx)
-                else:
-                    output = _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
-        else:
-            output = _matmul4bit_v2(x, qweight, scales.float(), zeros, g_idx)
-    return output
-
-
-def v2_to_v1(scales, zeros):
-    """
-    Convert zeros in V2 model to V1 model when group_num = 1, for debugging
-    depreciated
-    """
-    assert zeros.shape[0] == 1
-    z_mat = torch.zeros((zeros.shape[1], 256), dtype=torch.int, device=zeros.device) + zeros.reshape((-1,1))
-    z_buffer = torch.zeros((z_mat.shape[0] * 8, z_mat.shape[1]), dtype=torch.float16, device=zeros.device)
-    z_zeros = torch.zeros(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
-    z_scales = torch.ones(z_mat.shape[1], dtype=torch.float16, device=zeros.device)
-    quant_cuda.vecquant4recons_v1(z_mat, z_buffer, z_scales, z_zeros)
-    z_buffer = z_buffer[:,0]
-    zeros_recons = z_buffer * scales + scales
-    return zeros_recons
@@ -0,0 +1,57 @@
+
+My personal repo for converting LoRA models to Hugging Face / GGML / GPTQ 4-bit formats so I can run them in the regular text-generation-webui and llama.cpp.
+
+How do we do this?
+
+1. lora -> hf
+    - [tloen/alpaca-lora/export_hf_checkpoint.py](https://github.com/tloen/alpaca-lora/blob/main/export_hf_checkpoint.py)
+2. hf -> 4bit
+    - using [GPTQ-for-LLaMa/llama.py](https://github.com/qwopqwop200/GPTQ-for-LLaMa/blob/triton/llama.py)
+    `CUDA_VISIBLE_DEVICES=0 python llama.py ./llama-hf/llama-7b c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save llama7b-4bit-128g.pt`
+3. hf -> ggml
+    - [llama.cpp/convert-pth-to-ggml.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-pth-to-ggml.py)
+
+
+# TODO
+
+- [ ] lora -> hf
+- [ ] hf -> 4bit
+- [ ] hf -> ggml
+
+# setup env
+
+```sh
+
+conda create -n textgen3 python=3.10.9
+conda activate textgen3
+mamba install pytorch torchvision torchaudio pytorch-cuda=11.7 cudatoolkit-dev==11.7  cudatoolkit=11.7 -c pytorch -c nvidia  -c conda-forge 
+```
+
+# download models
+
+```sh
+# # base models.... FIXME
+# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama30b-4bit.pt -O ../llama-30b-4bit.pt
+# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama13b-4bit.pt -O ../llama-13b-4bit.pt
+# wget https://huggingface.co/maderix/llama-65b-4bit/resolve/main/llama7b-4bit.pt -O ../llama-7b-4bit.pt
+# cool models:
+# - https://huggingface.co/jordiclive/gpt4all-alpaca-oa-codealpaca-lora-13b
+# - https://huggingface.co/Black-Engineer/oasst-llama30b-ggml-q4
+# - https://huggingface.co/chansung/alpaca-lora-30b
+
+# download loras
+python scripts/download-model.py chansung/alpaca-lora-30b
+python scripts/download-model.py chansung/alpaca-lora-13b
+python scripts/download-model.py tloen/alpaca-lora-7b
+```
+
+# convert models
+
+```sh
+python scripts/export_hf_checkpoint.py ./models/llama-7b-hf -l loras/tloen_alpaca-lora-7b
+```
+
+
+# Links
+
+- https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md
@@ -1,144 +0,0 @@
-from typing import List, Optional, Tuple
-
-import torch
-from torch import nn
-
-import transformers
-from transformers.models.llama.modeling_llama import LlamaConfig, LlamaRotaryEmbedding, apply_rotary_pos_emb
-
-from einops import rearrange
-
-from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
-from flash_attn.bert_padding import unpad_input, pad_input
-
-class LlamaAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(
-        self,
-        config: LlamaConfig,
-    ):
-        super().__init__()
-        hidden_size = config.hidden_size
-        num_heads = config.num_attention_heads
-        self.hidden_size = hidden_size
-        self.num_heads = num_heads
-        self.head_dim = self.hidden_size // num_heads
-
-        if (self.head_dim * num_heads) != self.hidden_size:
-            raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
-                f" and `num_heads`: {num_heads}).")
-        self.q_proj = nn.Linear(
-            hidden_size,
-            num_heads * self.head_dim,
-            bias=False,
-        )
-        self.k_proj = nn.Linear(
-            hidden_size,
-            num_heads * self.head_dim,
-            bias=False,
-        )
-        self.v_proj = nn.Linear(
-            hidden_size,
-            num_heads * self.head_dim,
-            bias=False,
-        )
-        self.o_proj = nn.Linear(
-            num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-        )
-        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim)
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads,
-                           self.head_dim).transpose(1, 2).contiguous()
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
-               Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel
-        
-        attention_mask: [bsz, q_len]
-        """
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states).view(
-            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(
-            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(
-            bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        # [bsz, q_len, nh, hd]
-        # [bsz, nh, q_len, hd]
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states,
-                                                        key_states,
-                                                        cos,
-                                                        sin,
-                                                        position_ids)
-        # [bsz, nh, t, hd]
-        assert not output_attentions, "output_attentions is not supported"
-        assert not use_cache, "use_cache is not supported"
-        assert past_key_value is None, "past_key_value is not supported"
-
-        # Flash attention codes from
-        # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
-
-        # transform the data into the format required by flash attention
-        qkv = torch.stack([query_states, key_states, value_states], dim=2) # [bsz, nh, 3, q_len, hd]
-        qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
-        # We have disabled _prepare_decoder_attention_mask in LlamaModel
-        # the attention_mask should be the same as the key_padding_mask
-        key_padding_mask = attention_mask
-
-
-        if key_padding_mask is None:
-            qkv = rearrange(qkv, 'b s ... -> (b s) ...')
-            max_s = q_len
-            cu_q_lens = torch.arange(0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32,
-                                    device=qkv.device)
-            output = flash_attn_unpadded_qkvpacked_func(
-                qkv, cu_q_lens, max_s, 0.0,
-                softmax_scale=None, causal=True
-            )
-            output = rearrange(output, '(b s) ... -> b s ...', b=bsz)
-        else:
-            nheads = qkv.shape[-2]
-            x = rearrange(qkv, 'b s three h d -> b s (three h d)')
-            x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
-            x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
-            output_unpad = flash_attn_unpadded_qkvpacked_func(
-                x_unpad, cu_q_lens, max_s, 0.0,
-                softmax_scale=None, causal=True
-            )
-            output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
-                                        indices, bsz, q_len),
-                            'b s (h d) -> b s h d', h=nheads)
-        return self.o_proj(rearrange(output,
-                                     'b s h d -> b s (h d)')), None, None
-
-
-# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
-def _prepare_decoder_attention_mask(self, attention_mask, input_shape,
-                                    inputs_embeds, past_key_values_length):
-    # [bsz, seq_len]
-    return attention_mask
-
-
-def replace_llama_attn_with_flash_attn():
-    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
-    transformers.models.llama.modeling_llama.LlamaAttention = LlamaAttention
@@ -1,12 +1,13 @@
-torch
+# torch
 accelerate
 bitsandbytes
 datasets
 sentencepiece
 safetensors
-flash-attn
+# flash-attn
 triton
 colorama
-git+https://github.com/huggingface/transformers.git
+git+https://github.com/huggingface/transformers.git@656e869
 git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
-git+https://github.com/sterlind/peft.git
+# git+https://github.com/sterlind/peft.git@085c09d
+git+https://github.com/wassname/peft.git
@@ -0,0 +1,37 @@
+'''
+clones models from Hugging Face to models/model-name.
+
+Example:
+python clone-model.py facebook/opt-1.3b
+
+'''
+
+from git import Repo
+import argparse
+from tqdm.auto import tqdm
+from git import RemoteProgress
+
+
+# Command-line interface. NOTE: the arguments are parsed here, at module import
+# time, so importing this file from another script also consumes sys.argv.
+parser = argparse.ArgumentParser()
+# Required positional argument; the help text shows an example repo id.
+parser.add_argument('MODEL', type=str, default=None, help="`tloen/alpaca-lora-7b`")
+parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
+args = parser.parse_args()
+
+class CloneProgress(RemoteProgress):
+    """Mirror GitPython's clone progress callbacks onto a tqdm bar."""
+
+    def __init__(self):
+        super().__init__()
+        self.pbar = tqdm()
+
+    def update(self, op_code, cur_count, max_count=None, message=''):
+        """Copy the counters reported by GitPython onto the bar and redraw it."""
+        bar = self.pbar
+        bar.total = max_count
+        bar.n = cur_count
+        bar.refresh()
+
+if __name__ == '__main__':
+    # Clone https://huggingface.co/<MODEL> into ./models/<org>_<name>.
+    model = args.MODEL
+    repo = 'https://huggingface.co/' + model
+    name = model.replace('/', '_')
+    dest = f'./models/{name}'
+    print(f'cloning "{repo}" to "{dest}"')
+    # Extra keyword arguments are forwarded to `git clone`, so this becomes
+    # `--branch=<name>`. Without it the --branch option was silently ignored
+    # and the remote's default branch was always cloned.
+    Repo.clone_from(repo, dest, progress=CloneProgress(), branch=args.branch)
@@ -0,0 +1,275 @@
+'''
+Downloads models from Hugging Face to models/model-name.
+
+Example:
+python download-model.py facebook/opt-1.3b
+
+From https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/download-model.py
+
+'''
+
+import argparse
+import base64
+import datetime
+import hashlib
+import json
+import re
+import sys
+from pathlib import Path
+
+import requests
+import tqdm
+from tqdm.contrib.concurrent import thread_map
+
+
+# Command-line interface. NOTE: the arguments are parsed here, at module import
+# time, so importing this file from another script also consumes sys.argv.
+parser = argparse.ArgumentParser()
+# MODEL is optional (nargs='?') and is None when omitted.
+parser.add_argument('MODEL', type=str, default=None, nargs='?')
+parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.')
+parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
+parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
+parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
+parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
+args = parser.parse_args()
+
+
+def select_model_from_default_options():
+    """Interactively ask the user which model to download.
+
+    Prints a lettered menu of preset models plus a final "None of the above"
+    entry and reads the answer from standard input. Blank or unrecognised
+    answers repeat the prompt instead of raising IndexError/KeyError.
+
+    Returns:
+        A ``(model, branch)`` tuple where ``model`` has the form
+        ``organization/name``. A manually typed model always uses ``"main"``.
+    """
+    models = {
+        "OPT 6.7B": ("facebook", "opt-6.7b", "main"),
+        "OPT 2.7B": ("facebook", "opt-2.7b", "main"),
+        "OPT 1.3B": ("facebook", "opt-1.3b", "main"),
+        "OPT 350M": ("facebook", "opt-350m", "main"),
+        "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"),
+        "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"),
+        "GALACTICA 125M": ("facebook", "galactica-125m", "main"),
+        "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"),
+        "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"),
+        "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"),
+        "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"),
+    }
+    choices = {}
+
+    print("Select the model that you want to download:\n")
+    for i, name in enumerate(models):
+        char = chr(ord('A') + i)
+        choices[char] = name
+        print(f"{char}) {name}")
+    # Letter right after the last preset: the "type it yourself" option.
+    other = chr(ord('A') + len(models))
+    print(f"{other}) None of the above")
+
+    print()
+    while True:
+        print("Input> ", end='')
+        # Strip BEFORE taking the first character so leading whitespace is
+        # ignored; slicing with [:1] yields '' for an empty line rather than
+        # raising IndexError the way [0] does.
+        choice = input().strip().upper()[:1]
+        if choice == other or choice in choices:
+            break
+        print(f"Invalid choice. Please enter a letter from A to {other}.")
+
+    if choice == other:
+        print("""\nThen type the name of your desired Hugging Face model in the format organization/name.
+
+Examples:
+facebook/opt-1.3b
+EleutherAI/pythia-1.4b-deduped
+""")
+
+        print("Input> ", end='')
+        model = input()
+        branch = "main"
+    else:
+        arr = models[choices[choice]]
+        model = f"{arr[0]}/{arr[1]}"
+        branch = arr[2]
+
+    return model, branch
+
+
+def sanitize_model_and_branch_names(model, branch):
+    """Normalise a model id and validate a branch name.
+
+    Args:
+        model: Repository id; a single trailing ``/`` is removed.
+        branch: Branch name, or ``None`` to select ``"main"``.
+
+    Returns:
+        The cleaned ``(model, branch)`` tuple.
+
+    Raises:
+        ValueError: If the model id is empty, or the branch contains anything
+            other than alphanumerics, period, underscore and dash.
+    """
+    if model and model.endswith('/'):
+        model = model[:-1]
+    # An empty id used to crash with IndexError on model[-1]; report it as the
+    # same ValueError type already raised for a bad branch.
+    if not model:
+        raise ValueError("Model name must not be empty.")
+
+    if branch is None:
+        return model, "main"
+
+    # fullmatch rather than match with ^...$: `$` also matches just before a
+    # trailing newline, which would let "main\n" through.
+    if re.fullmatch(r"[a-zA-Z0-9._-]+", branch) is None:
+        raise ValueError("Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.")
+
+    return model, branch
+
+
+def get_download_links_from_huggingface(model, branch, text_only=False):
+    """Collect download URLs for the files of a Hugging Face repository.
+
+    Pages through the ``/api/models/{model}/tree/{branch}`` endpoint and keeps
+    weight files (pytorch/adapter ``.bin``, safetensors, ``.pt``, ggml),
+    tokenizer models and text files (txt/json/py/md).
+
+    Args:
+        model: Repository id in ``organization/name`` form.
+        branch: Branch whose file tree is listed.
+        text_only: When true, only text and tokenizer files are linked.
+
+    Returns:
+        ``(links, sha256, is_lora)``: the download URLs, ``[path, oid]`` pairs
+        for kept entries that carry ``lfs`` metadata, and whether an adapter
+        config/model file was seen.
+
+    Raises:
+        Exception: If the API answers with an ``error`` payload.
+    """
+    base = "https://huggingface.co"
+    page = f"/api/models/{model}/tree/{branch}?cursor="
+    cursor = b""
+
+    links = []
+    sha256 = []
+    classifications = []
+    has_pytorch = False
+    has_pt = False
+    has_safetensors = False
+    is_lora = False
+    while True:
+        content = requests.get(f"{base}{page}{cursor.decode()}").content
+
+        entries = json.loads(content)
+        if len(entries) == 0:
+            break
+
+        # An error response is a JSON object rather than a list of entries;
+        # check once per page instead of once per entry.
+        if isinstance(entries, dict) and 'error' in entries:
+            print("you might need to run `huggingface-cli login`")
+            raise Exception(entries['error'])
+
+        for entry in entries:
+            fname = entry['path']
+            if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
+                is_lora = True
+
+            # Raw strings: "\." in a normal literal is an invalid escape.
+            is_pytorch = re.match(r"(pytorch|adapter)_model.*\.bin", fname)
+            is_safetensors = re.match(r".*\.safetensors", fname)
+            is_pt = re.match(r".*\.pt", fname)
+            is_ggml = re.match(r"ggml.*\.bin", fname)
+            is_tokenizer = re.match(r"tokenizer.*\.model", fname)
+            is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
+
+            if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)):
+                if 'lfs' in entry:
+                    sha256.append([fname, entry['lfs']['oid']])
+                if is_text:
+                    links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
+                    classifications.append('text')
+                    continue
+                if not text_only:
+                    links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
+                    # Exactly one label per appended link keeps `links` and
+                    # `classifications` index-aligned for the pruning below.
+                    if is_safetensors:
+                        has_safetensors = True
+                        classifications.append('safetensors')
+                    elif is_pytorch:
+                        has_pytorch = True
+                        classifications.append('pytorch')
+                    elif is_pt:
+                        has_pt = True
+                        classifications.append('pt')
+                    elif is_ggml:
+                        classifications.append('ggml')
+
+        # Build the cursor for the next page from the last path on this one.
+        cursor = base64.b64encode(f'{{"file_name":"{entries[-1]["path"]}"}}'.encode()) + b':50'
+        cursor = base64.b64encode(cursor)
+        cursor = cursor.replace(b'=', b'%3D')
+
+    # If both pytorch and safetensors are available, download safetensors only
+    if (has_pytorch or has_pt) and has_safetensors:
+        # Walk backwards so popping does not shift indices still to be visited.
+        for i in range(len(classifications) - 1, -1, -1):
+            if classifications[i] in ['pytorch', 'pt']:
+                links.pop(i)
+
+    return links, sha256, is_lora
+
+
+def get_output_folder(model, branch, is_lora, base_folder=None):
+    """Return the directory in which a model's files are stored.
+
+    The folder name is the last two ``/``-separated parts of ``model`` joined
+    with ``_``, with ``_<branch>`` appended for any branch other than ``main``.
+    When ``base_folder`` is not given it is ``loras`` for LoRA adapters and
+    ``models`` otherwise.
+    """
+    if base_folder is None:
+        base_folder = 'loras' if is_lora else 'models'
+
+    name_parts = model.split('/')[-2:]
+    folder_name = '_'.join(name_parts)
+    if branch != 'main':
+        folder_name = f'{folder_name}_{branch}'
+    return Path(base_folder) / folder_name
+
+
+def get_single_file(url, output_folder, start_from_scratch=False):
+    """Download ``url`` into ``output_folder``, resuming a partial file.
+
+    Args:
+        url: File URL; the text after the last ``/`` becomes the file name.
+        output_folder: ``pathlib.Path`` of the destination directory.
+        start_from_scratch: When true, an existing file is overwritten
+            instead of being resumed or skipped.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status, so an
+            error page is never written into the target file.
+    """
+    filename = Path(url.rsplit('/', 1)[1])
+    output_path = output_folder / filename
+    headers = {}
+    mode = 'wb'
+    if output_path.exists() and not start_from_scratch:
+        local_size = output_path.stat().st_size
+        # Only the headers are needed; the context manager releases the
+        # streamed connection, which was previously left open.
+        with requests.get(url, stream=True) as probe:
+            probe.raise_for_status()
+            total_size = int(probe.headers.get('content-length', 0))
+        # Check if the file has already been downloaded completely
+        if local_size >= total_size:
+            return
+        # Otherwise, resume the download from where it left off
+        headers = {'Range': f'bytes={local_size}-'}
+        mode = 'ab'
+
+    with requests.get(url, stream=True, headers=headers) as r:
+        # Fail before open() so an existing file is not truncated on error.
+        r.raise_for_status()
+        if mode == 'ab' and r.status_code != 206:
+            # The server ignored the Range header and is sending the whole
+            # file; appending would corrupt it, so start over.
+            mode = 'wb'
+        with open(output_path, mode) as f:
+            total_size = int(r.headers.get('content-length', 0))
+            block_size = 1024
+            with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
+                for data in r.iter_content(block_size):
+                    t.update(len(data))
+                    f.write(data)
+
+
+def start_download_threads(file_list, output_folder, start_from_scratch=False, threads=1):
+    """Download every URL in ``file_list`` using up to ``threads`` workers."""
+    def fetch(url):
+        return get_single_file(url, output_folder, start_from_scratch=start_from_scratch)
+
+    # disable=True is passed through to thread_map unchanged.
+    thread_map(fetch, file_list, max_workers=threads, disable=True)
+
+
+def download_model_files(model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1):
+    """Write the metadata file for a model and download all of its files.
+
+    Args:
+        model: Repository id, recorded in the metadata file.
+        branch: Branch name, recorded in the metadata file.
+        links: URLs of the files to download.
+        sha256: ``[path, digest]`` pairs recorded in the metadata file.
+        output_folder: ``pathlib.Path`` destination; created when missing.
+        start_from_scratch: Passed on to the downloader for every file.
+        threads: Number of simultaneous downloads.
+    """
+    # Creating the folder and writing the metadata.
+    # parents=True: a plain mkdir() raises FileNotFoundError when the parent
+    # directory (e.g. a git-ignored "models" or "loras") does not exist yet.
+    output_folder.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
+        f.write(f'url: https://huggingface.co/{model}\n')
+        f.write(f'branch: {branch}\n')
+        f.write(f'download date: {timestamp}\n')
+        sha256_str = ''.join(f'    {digest} {fname}\n' for fname, digest in sha256)
+        if sha256_str:
+            f.write(f'sha256sum:\n{sha256_str}')
+
+    # Downloading the files
+    print(f"Downloading the model to {output_folder}")
+    start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads)
+
+
+def check_model_files(model, branch, links, sha256, output_folder):
+    """Verify downloaded files against their expected SHA-256 digests.
+
+    Args:
+        model: Unused; kept for a signature parallel to download_model_files.
+        branch: Unused; see ``model``.
+        links: Unused; see ``model``.
+        sha256: ``[path, digest]`` pairs to verify.
+        output_folder: ``pathlib.Path`` of the directory holding the files.
+
+    Returns:
+        ``True`` when every listed file exists and matches its digest,
+        ``False`` otherwise. A result line is printed for each file.
+    """
+    # Validate the checksums
+    validated = True
+    for fname, expected in sha256:
+        fpath = output_folder / fname
+
+        if not fpath.exists():
+            print(f"The following file is missing: {fpath}")
+            validated = False
+            continue
+
+        # Hash in 1 MiB chunks: model files can be many gigabytes, so reading
+        # one whole into memory may exhaust RAM.
+        digest = hashlib.sha256()
+        with open(fpath, "rb") as f:
+            for chunk in iter(lambda: f.read(1024 * 1024), b""):
+                digest.update(chunk)
+
+        if digest.hexdigest() != expected:
+            print(f'Checksum failed: {fname}  {expected}')
+            validated = False
+        else:
+            print(f'Checksum validated: {fname}  {expected}')
+
+    if validated:
+        print('[+] Validated checksums of all model files!')
+    else:
+        print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
+
+    return validated
+
+
+if __name__ == '__main__':
+    # Script entry point: resolve the model, list its files, then either
+    # verify an existing download (--check) or fetch the files.
+    branch = args.branch
+    model = args.MODEL
+    if model is None:
+        model, branch = select_model_from_default_options()
+
+    # Cleaning up the model/branch names
+    try:
+        model, branch = sanitize_model_and_branch_names(model, branch)
+    except ValueError as err_branch:
+        print(f"Error: {err_branch}")
+        # Non-zero status so shell scripts and CI can detect the failure;
+        # a bare sys.exit() reports success.
+        sys.exit(1)
+
+    # Getting the download links from Hugging Face
+    links, sha256, is_lora = get_download_links_from_huggingface(model, branch, text_only=args.text_only)
+
+    # Getting the output folder
+    output_folder = get_output_folder(model, branch, is_lora, base_folder=args.output)
+
+    if args.check:
+        # Check previously downloaded files
+        check_model_files(model, branch, links, sha256, output_folder)
+    else:
+        # Download files. --clean must be forwarded, otherwise the flag is
+        # parsed but partial downloads are always resumed.
+        download_model_files(model, branch, links, sha256, output_folder, start_from_scratch=args.clean, threads=args.threads)
@@ -0,0 +1,81 @@
+"""
+From https://raw.githubusercontent.com/tloen/alpaca-lora/main/export_hf_checkpoint.py
+"""
+import os
+from pathlib import Path
+import argparse
+import torch
+import transformers
+from peft import PeftModel
+from transformers import LlamaForCausalLM, LlamaTokenizer  # noqa: F402
+
+def main(BASE_MODEL, LORA_MODEL, output_path=None):
+    """Merge a LoRA adapter into a LLaMA base model and save the result.
+
+    Args:
+        BASE_MODEL: Identifier passed to ``LlamaTokenizer.from_pretrained``
+            and ``LlamaForCausalLM.from_pretrained``.
+        LORA_MODEL: Identifier passed to ``PeftModel.from_pretrained``.
+        output_path: Destination passed to ``save_pretrained``. When None,
+            ``'models/<last path component of LORA_MODEL>-delorified'``
+            is used.
+
+    Raises:
+        AssertionError: If loading the adapter changed the base weights, or
+            if merging the adapter left the checked weight unchanged.
+    """
+    if output_path is None:
+        output_path = 'models/' + LORA_MODEL.split('/')[-1] + '-delorified'
+
+    # NOTE(review): the tokenizer is loaded but not used or saved below;
+    # the call is kept so behaviour matches the upstream script.
+    tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
+
+    base_model = LlamaForCausalLM.from_pretrained(
+        BASE_MODEL,
+        load_in_8bit=False,
+        torch_dtype=torch.float16,
+        device_map={"": "cpu"},
+    )
+
+    # Keep a live reference and a snapshot of one weight matrix so that the
+    # effect of the steps below can be checked.
+    first_weight = base_model.model.layers[0].self_attn.q_proj.weight
+    first_weight_old = first_weight.clone()
+
+    lora_model = PeftModel.from_pretrained(
+        base_model,
+        LORA_MODEL,
+        device_map={"": "cpu"},
+        torch_dtype=torch.float16,
+    )
+
+    # Attaching the adapter must not touch the base weights yet. Raised
+    # explicitly (instead of `assert`) so the check survives `python -O`.
+    if not torch.allclose(first_weight_old, first_weight):
+        raise AssertionError(
+            "Loading the LoRA adapter modified the base model weights"
+        )
+
+    # merge weights - new merging method from peft
+    lora_model = lora_model.merge_and_unload()
+
+    lora_model.train(False)
+
+    # did we do anything?
+    if torch.allclose(first_weight_old, first_weight):
+        raise AssertionError(
+            "Merging the LoRA adapter did not change the base model weights"
+        )
+
+    # Drop any remaining LoRA-specific entries and strip the PEFT wrapper
+    # prefix from the parameter names before saving.
+    lora_model_sd = lora_model.state_dict()
+    deloreanized_sd = {
+        k.replace("base_model.model.", ""): v
+        for k, v in lora_model_sd.items()
+        if "lora" not in k
+    }
+
+    LlamaForCausalLM.save_pretrained(
+        base_model, output_path, state_dict=deloreanized_sd, max_shard_size="400MB"
+    )
+    print(f'output {output_path}')
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the base model, the LoRA and the
+    # optional output folder, then run the merge.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('model', type=str, help='Base model repo or path e.g. `huggyllama/llama-7b`')
+    # NOTE(review): the default 'main' looks like a leftover branch name
+    # rather than a usable LoRA identifier - confirm before relying on it.
+    parser.add_argument('-l', '--lora', type=str, default='main', help='Lora repo or path e.g. `tloen/alpaca-lora-7b`')
+    parser.add_argument(
+        '-o', '--output', type=Path, default=None,
+        help='Output folder e.g. ./hf_ckpt. Defaults to models/<lora name>-delorified',
+    )
+    args = parser.parse_args()
+    main(args.model, args.lora, args.output)
+
@@ -1,273 +0,0 @@
-import torch
-
-from abc import ABC, abstractmethod
-from typing import Dict, Any
-from datasets import load_dataset, Dataset
-from torch.utils.data import DataLoader
-from transformers import DefaultDataCollator
-import os
-
-
-# Abstract train data loader
-class ATrainData(ABC):
-    """
-    """
-    @abstractmethod
-    def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len: int) -> None:
-        """
-        Args:
-            dataset (str): Path to dataset
-            val_set_size (int) : Size of validation set
-            tokenizer (_type_): Tokenizer
-        """
-        self.tokenizer = tokenizer
-        self.dataset = dataset
-        self.val_set_size = val_set_size
-        self.cutoff_len = cutoff_len
-        self.train_data = None
-        self.val_data = None
-
-    @abstractmethod
-    def tokenize(self, prompt: str) -> Dict[str, Any]:
-        """Tokenization method
-
-        Args:
-            prompt (str): Prompt string from dataset
-
-        Returns:
-            Dict[str, Any]: token
-        """
-        pass
-
-    @abstractmethod
-    def prepare_data(self) -> None:
-        """Loads dataset from file and prepares train_data property for trainer
-        """
-        pass
-
-
-# LLaMA txt train data loader
-class TrainTxt(ATrainData):
-    def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len):
-        super().__init__(dataset, val_set_size, tokenizer, cutoff_len)  # TODO: Validation size isn't used
-        self.cutoff_len = cutoff_len
-        self.exceed_count = 0
-
-    def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
-        # there's probably a way to do this with the tokenizer settings
-        # but again, gotta move fast
-        if use_eos_token:
-            result = self.tokenizer(
-                prompt + self.tokenizer.eos_token,
-                truncation=True,
-                max_length=self.cutoff_len,
-                padding=False,
-            )
-            d = {
-                "input_ids": result["input_ids"],
-                "attention_mask": result["attention_mask"],
-            }
-            if (
-                d["input_ids"][-1] != self.tokenizer.eos_token_id
-                and len(d["input_ids"]) < self.cutoff_len
-            ):
-                d["input_ids"].append(self.tokenizer.eos_token_id)
-                d["attention_mask"].append(1)
-        else:
-            result = self.tokenizer(
-                prompt,
-                truncation=True,
-                max_length=self.cutoff_len + 1,
-                padding="max_length",
-            )
-            d = {
-                "input_ids": result["input_ids"][:-1],
-                "attention_mask": result["attention_mask"][:-1],
-            }
-        if sum(d['attention_mask']) >= self.cutoff_len:
-            self.exceed_count += 1
-        return d
-
-    @classmethod
-    def format_new_rows(cls, rows, thd=128):
-        r_b = ''
-        new_rows = []
-        for row in rows:
-            if len(r_b) == 0:
-                r_b += row
-            else:
-                r_b += '\n' + row
-            if len(r_b) > thd:
-                new_rows.append(r_b)
-                r_b = ''
-        if len(r_b) > thd:
-            new_rows.append(r_b)
-            r_b = ''
-        return new_rows
-
-    def prepare_data(self, thd=-1, use_eos_token=True, **kwargs):
-        if os.path.isdir(self.dataset):
-            rows = []
-            for filename in os.listdir(self.dataset):
-                with open(self.dataset + filename, 'r', encoding='utf8') as file:
-                    txt = file.read()
-                txt = txt.replace('\r\n', '\n').replace('\u3000', ' ')
-                rows += [r for r in txt.split('\n') if r != '']
-        else:
-            with open(self.dataset, 'r', encoding='utf8') as file:
-                txt = file.read()
-            txt = txt.replace('\r\n', '\n')
-            rows = [r for r in txt.split('\n') if r != '']
-        if thd != -1:
-            rows = self.format_new_rows(rows, thd=thd)
-        data = Dataset.from_dict({"input": rows})
-        data = data.shuffle().map(lambda x: self.tokenize(x["input"], use_eos_token=use_eos_token))
-        print('Train Data: {:.2f}%'.format(self.exceed_count / len(data) * 100), 'outliers')
-        self.train_data = data
-
-
-# Stanford Alpaca-like Data
-class TrainSAD(ATrainData):
-    def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
-        super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
-
-    def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
-        # there's probably a way to do this with the tokenizer settings
-        # but again, gotta move fast
-        if use_eos_token:
-            result = self.tokenizer(
-                prompt + self.tokenizer.eos_token,
-                truncation=True,
-                max_length=self.cutoff_len,
-                padding=False,
-            )
-            if (
-                result["input_ids"][-1] != self.tokenizer.eos_token_id
-                and len(result["input_ids"]) < self.cutoff_len
-            ):
-                result["input_ids"].append(self.tokenizer.eos_token_id)
-                result["attention_mask"].append(1)
-            return result
-        else:
-            result = self.tokenizer(
-                prompt,
-                truncation=True,
-                max_length=self.cutoff_len + 1,
-                padding="max_length",
-            )
-            return {
-                "input_ids": result["input_ids"][:-1],
-                "attention_mask": result["attention_mask"][:-1],
-            }
-
-    def prepare_data(self, use_eos_token=True, **kwargs) -> None:
-        data = load_dataset("json", data_files=self.dataset)
-
-        if self.val_set_size > 0:
-            train_val = data["train"].train_test_split(
-                test_size=self.val_set_size, shuffle=True, seed=42  # ! Seed = 42 (?)
-            )
-            self.train_data = train_val["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
-            self.val_data = train_val["test"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
-        else:
-            self.train_data = data["train"].shuffle().map(lambda x: self.generate_and_tokenize_prompt(x, use_eos_token=use_eos_token))
-            self.val_data = None
-
-    # Auxiliary methods
-    def generate_prompt(self, data_point, **kwargs):
-        return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format(
-            "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
-            "### Instruction:",
-            data_point["instruction"],
-            "### Input:",
-            data_point["input"],
-            "### Response:",
-            data_point["output"]
-        )
-
-    def generate_and_tokenize_prompt(self, data_point, **kwargs):
-        prompt = self.generate_prompt(data_point, **kwargs)
-        return self.tokenize(prompt, **kwargs)
-
-# GPT4All-like Data
-class TrainGPT4All(ATrainData):
-    def __init__(self, dataset: str, val_set_size: int, tokenizer, cutoff_len) -> None:
-        super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
-
-    def tokenize(self, prompt: str, use_eos_token=True, **kwargs) -> Dict[str, Any]:
-        pass
-
-    def tokenize_inputs(self, examples):
-        max_length = self.cutoff_len
-        input_ids = torch.full((len(examples["prompt"]), max_length), self.tokenizer.pad_token_id)
-        # ignore bos
-        newline_tokens = self.tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:]
-
-        out = {"labels": [], "attention_mask": []}
-        for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])):
-            input_tokens = self.tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze()
-            if input_tokens.dim() == 0:
-                input_tokens = input_tokens.unsqueeze(0)
-
-            input_len = len(input_tokens)
-
-            # plus one since we remove bos from response
-            # but we subtract one since we want to add eos token
-            remaining_tokens = max_length - input_len - len(newline_tokens) + 1
-            # remove bos
-            target_tokens = self.tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:]
-
-            input_ids[i, :input_len] = input_tokens
-            # add newline between prompt and response
-            newline_plus_inputs = input_len + len(newline_tokens)
-            input_ids[i, input_len: newline_plus_inputs] = newline_tokens
-
-            # add target tokens, remove bos
-            input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens
-            # add eos token, enforce stopping if we don't truncate
-            # we don't want long code to stop generating if truncated during training
-            if newline_plus_inputs + len(target_tokens) < max_length:
-                input_ids[i, newline_plus_inputs + len(target_tokens)] = self.tokenizer.eos_token_id
-
-            labels = input_ids[i].clone()
-            labels[: newline_plus_inputs] = -100
-            labels[labels == self.tokenizer.pad_token_id] = -100
-            # to debug this, can set all values == -100 to the pad token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response
-
-            attention_mask = input_ids[i].ne(self.tokenizer.pad_token_id).int()
-
-            out["labels"].append(labels)
-            out["attention_mask"].append(attention_mask)
-
-        out["input_ids"] = input_ids
-
-        out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()}
-
-        return out
-
-    def prepare_data(self, **kwargs) -> None:
-        dataset = load_dataset("json", data_files=self.dataset)
-
-        self.val_data = None
-        if self.val_set_size > 0:
-            dataset = dataset["train"].train_test_split(
-                test_size=self.val_set_size, shuffle=True, seed=42  # ! Seed = 42 (?)
-            )
-            train_dataset, val_dataset = dataset["train"], dataset["test"]
-
-            # tokenize inputs and return labels and attention mask
-            val_dataset = val_dataset.map(
-                lambda ele: self.tokenize_inputs(ele),
-                batched=True,
-                remove_columns=["source", "prompt"],
-            )
-            self.val_data = val_dataset.with_format("torch")
-        else:
-            train_dataset = dataset["train"]
-
-        train_dataset = train_dataset.map(
-            lambda ele: self.tokenize_inputs(ele),
-            batched=True,
-            remove_columns=["source", "prompt"],
-        )
-        self.train_data = train_dataset.with_format("torch")
@@ -1,246 +0,0 @@
-import triton
-import triton.language as tl
-import torch
-import custom_autotune
-
-
-# code based https://github.com/fpgaminer/GPTQ-triton
-@custom_autotune.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        # These provided a benefit on a 3090
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-    ],
-    key=['M', 'N'],
-    nearest_power_of_two=True,
-)
-
-
-@triton.jit
-def matmul_248_kernel(a_ptr, b_ptr, c_ptr,
-                      scales_ptr, zeros_ptr, g_ptr,
-                      M, N, K, bits, maxq,
-                      stride_am, stride_ak,
-                      stride_bk, stride_bn,
-                      stride_cm, stride_cn,
-                      stride_scales, stride_zeros,
-                      BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                      GROUP_SIZE_M: tl.constexpr):
-    """
-    Compute the matrix multiplication C = A x B.
-    A is of shape (M, K) float16
-    B is of shape (K//8, N) int32
-    C is of shape (M, N) float16
-    scales is of shape (G, N) float16
-    zeros is of shape (G, N) float16
-    g_ptr is of shape (K) int32 
-    """
-    infearure_per_bits = 32 // bits
-
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)   # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-    a_mask = (offs_am[:, None] < M)
-    # b_ptrs is set up such that it repeats elements along the K axis 8 times
-    b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)   # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-    g_ptrs = g_ptr + offs_k
-    # shifter is used to extract the N bits of each element in the 32-bit word from B
-    scales_ptrs = scales_ptr + offs_bn[None, :]
-    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits) 
-    
-    shifter = (offs_k % infearure_per_bits) * bits
-    zeros_shifter = (offs_bn % infearure_per_bits) * bits
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-            
-    for k in range(0, num_pid_k):
-        g_idx = tl.load(g_ptrs)
-
-        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        
-        zeros = (zeros >> zeros_shifter[None, :]) & maxq
-        zeros = (zeros + 1)
-        
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)   # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        b = tl.load(b_ptrs)   # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-        # Now we need to unpack b (which is N-bit values) into 32-bit values
-        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-        b = (b - zeros) * scales  # Scale and shift
-        # ! Convert to fp16
-        b = b.to(tl.float16)
-        a = a.to(tl.float16)
-
-        accumulator += tl.dot(a, b)
-        a_ptrs += BLOCK_SIZE_K
-        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
-        g_ptrs += BLOCK_SIZE_K
-
-    c = accumulator.to(tl.float16)
-    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
-    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
-    tl.store(c_ptrs, c, mask=c_mask)
-    
-
-# code based https://github.com/fpgaminer/GPTQ-triton
-@custom_autotune.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 256, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        # These provided a benefit on a 3090
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-    ],
-    key=['M', 'K'],
-    nearest_power_of_two=True,
-)
-
-
-@triton.jit
-def trans_matmul_248_kernel(a_ptr, b_ptr, c_ptr,
-                            scales_ptr, zeros_ptr, g_ptr,
-                            M, N, K, bits, maxq,
-                            stride_am, stride_ak,
-                            stride_bk, stride_bn,
-                            stride_cm, stride_cn,
-                            stride_scales, stride_zeros,
-                            BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                            GROUP_SIZE_M: tl.constexpr):
-    """
-    Compute the matrix multiplication C = A x B.
-    A is of shape (M, N) float16
-    B is of shape (K//8, N) int32
-    C is of shape (M, K) float16
-    scales is of shape (G, N) float16
-    zeros is of shape (G, N) float16
-    g_ptr is of shape (K) int32 
-    """
-    infearure_per_bits = 32 // bits
-
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_k
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_k = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
-    offs_n = tl.arange(0, BLOCK_SIZE_N)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak)   # (BLOCK_SIZE_M, BLOCK_SIZE_N)
-    a_mask = (offs_am[:, None] < M)
-    # b_ptrs is set up such that it repeats elements along the K axis 8 times
-    b_ptrs = b_ptr + ((offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn)   # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-    g_ptrs = g_ptr + offs_bk
-    g_idx = tl.load(g_ptrs)
-    
-    # shifter is used to extract the N bits of each element in the 32-bit word from B
-    scales_ptrs = scales_ptr + offs_n[None, :]  + g_idx[:, None] * stride_scales
-    zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros
-    
-    shifter = (offs_bk % infearure_per_bits) * bits
-    zeros_shifter = (offs_n % infearure_per_bits) * bits
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)
-    
-    for k in range(0, num_pid_n):
-        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-        scales = tl.load(scales_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        zeros = tl.load(zeros_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        
-        zeros = (zeros >> zeros_shifter[None, :]) & maxq
-        zeros = (zeros + 1)
-        
-        a = tl.load(a_ptrs, mask=a_mask, other=0.)   # (BLOCK_SIZE_M, BLOCK_SIZE_N)
-        b = tl.load(b_ptrs)   # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-        # Now we need to unpack b (which is N-bit values) into 32-bit values
-        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-        b = (b - zeros) * scales  # Scale and shift
-        b = tl.trans(b)
-        # ! Convert to fp16
-        b = b.to(tl.float16)
-        a = a.to(tl.float16)
-
-        accumulator += tl.dot(a, b)
-        a_ptrs += BLOCK_SIZE_N
-        b_ptrs += BLOCK_SIZE_N
-        scales_ptrs += BLOCK_SIZE_N
-        zeros_ptrs += (BLOCK_SIZE_N // infearure_per_bits)
-        
-    c = accumulator.to(tl.float16)
-    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :]
-    c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K)
-    tl.store(c_ptrs, c, mask=c_mask)
-    
-    
-def triton_matmul(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    assert input.shape[-1] == qweight.shape[0] * 32 // bits
-    outshape = input.shape[:-1] + (qweight.shape[1],)
-    input = input.reshape(-1, input.shape[-1])
-    output = torch.empty((input.shape[0], qweight.shape[1]), device=scales.device, dtype=torch.float16)
-    grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),)
-    matmul_248_kernel[grid](input, qweight, output,
-                            scales, qzeros, g_idx,
-                            input.shape[0], qweight.shape[1], input.shape[1], bits, maxq,
-                            input.stride(0), input.stride(1),
-                            qweight.stride(0), qweight.stride(1),
-                            output.stride(0), output.stride(1),
-                            scales.stride(0), qzeros.stride(0))
-    output = output.reshape(outshape)
-    return output
-
-
-def triton_matmul_transpose(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    assert input.shape[-1] == qweight.shape[1]
-    out_dim = qweight.shape[0] * 32 // bits
-    outshape = input.shape[:-1] + (out_dim,)
-    input = input.reshape(-1, input.shape[-1])
-    output_shape_mid = (input.shape[0], out_dim)
-    output = torch.empty((output_shape_mid[0], output_shape_mid[1]), device=scales.device, dtype=torch.float16)
-    grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_shape_mid[1], META['BLOCK_SIZE_K']),)
-    trans_matmul_248_kernel[grid](input, qweight, output,
-                            scales, qzeros, g_idx,
-                            input.shape[0], qweight.shape[1], output_shape_mid[1], bits, maxq,
-                            input.stride(0), input.stride(1),
-                            qweight.stride(0), qweight.stride(1),
-                            output.stride(0), output.stride(1),
-                            scales.stride(0), qzeros.stride(0))
-    output = output.reshape(outshape)
-    return output