
Commit

Putting the deprecation notice on bnb (8bit).
Narsil committed Sep 27, 2023
1 parent 8fe8cdb commit 3a71b0c
Showing 4 changed files with 34 additions and 12 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

28 changes: 22 additions & 6 deletions launcher/src/main.rs
@@ -21,12 +21,29 @@ mod env_runtime;

#[derive(Clone, Copy, Debug, ValueEnum)]
enum Quantization {
    /// 4 bit quantization. Requires a specific AWQ quantized model:
    /// https://hf.co/models?search=awq.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
    /// 8 bit quantization, doesn't require a specific model.
    /// Should be a drop-in replacement for bitsandbytes with much better performance.
    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
    Eetq,
    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
    /// the triton kernel (wider support) when it's not.
    /// AWQ has faster kernels.
    Gptq,
    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
    /// but it is known that the model will be much slower to run than the native f16.
    #[deprecated(since = "1.1.0", note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases")]
    Bitsandbytes,
    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
    /// but it is known that the model will be much slower to run than the native f16.
    BitsandbytesNF4,
    /// Bitsandbytes 4bit. nf4 should be preferred in most cases, but this one may have better
    /// perplexity performance for your model.
    BitsandbytesFP4,
}

impl std::fmt::Display for Quantization {
@@ -47,6 +64,7 @@ impl std::fmt::Display for Quantization {
            }
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
@@ -130,9 +148,7 @@ struct Args {
    #[clap(long, env)]
    num_shard: Option<usize>,

    /// Whether you want the model to be quantized.
    #[clap(long, env, value_enum)]
    quantize: Option<Quantization>,

1 change: 1 addition & 0 deletions server/.gitignore
@@ -160,3 +160,4 @@ flash-attention/
flash-attention-v2/
vllm/
llm-awq/
eetq/
9 changes: 7 additions & 2 deletions server/text_generation_server/utils/layers.py
@@ -5,6 +5,8 @@
from torch import nn
from torch.nn import functional as F
from typing import List
from loguru import logger
from functools import lru_cache

HAS_BITS_AND_BYTES = True
try:
@@ -242,6 +244,10 @@ def forward(self, x: torch.Tensor):
return out


@lru_cache(1)
def warn_deprecate_bnb():
    logger.warning("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance")

def get_linear(weight, bias, quantize):
    if quantize is None:
        linear = FastLinear(weight, bias)
@@ -251,8 +257,7 @@ def get_linear(weight, bias, quantize):
        else:
            raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
    elif quantize == "bitsandbytes":
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
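The `@lru_cache(1)` decorator is what turns the loguru call into a warn-once helper: `get_linear` runs for every quantized layer, but only the first call to `warn_deprecate_bnb()` actually executes the body; later calls just return the cached `None`. A minimal, self-contained sketch of that idiom (not taken from the repository; the loop standing in for repeated `get_linear` calls is purely illustrative):

from functools import lru_cache

from loguru import logger


@lru_cache(1)
def warn_deprecate_bnb():
    # The first call logs the warning and caches the (None) result; every
    # later call with the same (empty) arguments is served from the cache,
    # so the message appears exactly once per process.
    logger.warning(
        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, "
        "and has much better performance"
    )


for _ in range(5):  # stand-in for get_linear() being hit once per layer
    warn_deprecate_bnb()
# The log contains a single WARNING line, not five.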
