
Commit

Putting the deprecation notice on bnb (8bit).
Narsil committed Sep 27, 2023
1 parent 8fe8cdb commit 3a71b0c
Showing 4 changed files with 34 additions and 12 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

28 changes: 22 additions & 6 deletions launcher/src/main.rs
@@ -21,12 +21,29 @@ mod env_runtime;

#[derive(Clone, Copy, Debug, ValueEnum)]
enum Quantization {
    /// 4 bit quantization. Requires a specific AWQ quantized model:
    /// https://hf.co/models?search=awq.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
    /// 8 bit quantization, doesn't require a specific model.
    /// Should be a drop-in replacement for bitsandbytes with much better performance.
    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
    Eetq,
    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
    /// the triton kernel (wider support) when it's not.
    /// AWQ has faster kernels.
    Gptq,
    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
    /// but it is known that the model will be much slower to run than the native f16.
    #[deprecated(since = "1.1.0", note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases")]
    Bitsandbytes,
    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
    /// but it is known that the model will be much slower to run than the native f16.
    BitsandbytesNF4,
    /// Bitsandbytes 4bit. nf4 should be preferred in most cases, but this one may have better
    /// perplexity performance for your model.
    BitsandbytesFP4,
}

impl std::fmt::Display for Quantization {
@@ -47,6 +64,7 @@ impl std::fmt::Display for Quantization {
            }
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
@@ -130,9 +148,7 @@ struct Args {
    #[clap(long, env)]
    num_shard: Option<usize>,

    /// Whether you want the model to be quantized.
    #[clap(long, env, value_enum)]
    quantize: Option<Quantization>,

1 change: 1 addition & 0 deletions server/.gitignore
@@ -160,3 +160,4 @@ flash-attention/
flash-attention-v2/
vllm/
llm-awq/
eetq/
9 changes: 7 additions & 2 deletions server/text_generation_server/utils/layers.py
@@ -5,6 +5,8 @@
from torch import nn
from torch.nn import functional as F
from typing import List
from loguru import logger
from functools import lru_cache

HAS_BITS_AND_BYTES = True
try:
@@ -242,6 +244,10 @@ def forward(self, x: torch.Tensor):
return out


@lru_cache(1)
def warn_deprecate_bnb():
    logger.warning("Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance")

def get_linear(weight, bias, quantize):
    if quantize is None:
        linear = FastLinear(weight, bias)
@@ -251,8 +257,7 @@ def get_linear(weight, bias, quantize):
        else:
            raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
    elif quantize == "bitsandbytes":
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
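The `@lru_cache(1)` decorator is what turns the loguru call into a warn-once helper: `get_linear` runs for every quantized layer, but only the first call to `warn_deprecate_bnb()` actually executes the body; later calls just return the cached `None`. A minimal, self-contained sketch of that idiom (not taken from the repository; the loop standing in for repeated `get_linear` calls is purely illustrative):

from functools import lru_cache

from loguru import logger


@lru_cache(1)
def warn_deprecate_bnb():
    # The first call logs the warning and caches the (None) result; every
    # later call with the same (empty) arguments is served from the cache,
    # so the message appears exactly once per process.
    logger.warning(
        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, "
        "and has much better performance"
    )


for _ in range(5):  # stand-in for get_linear() being hit once per layer
    warn_deprecate_bnb()
# The log contains a single WARNING line, not five.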
