feat(server): Rework model loading (huggingface#344)
# What does this PR do?

Reworked the loading logic. The idea is to use cleaner loading code:

- Remove the need for `no_init_weights`
- Remove all the weird `bnb_linear`, `load_weights` and
`post_load_weights` code

New code layout (a rough sketch follows the list below):

- A new class `Weights` is in charge of loading the weights from
multiple files into the appropriate tensors (potentially sharded)
- TP layers are now "shells": they contain the code that knows which kind of
sharding we need plus the eventual `all_reduce`. They do not inherit from
Linear, but contain some kind of Linear instead
- The contained linear can be either FastLinear, BnbLinear or, coming next,
GPTQ Linear
- All modeling code is explicitly written for sharding; the process group is
just a no-op for non-sharded code (which removes a lot of test cases)
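
Below is a minimal sketch of that layout, assuming PyTorch and safetensors
checkpoints. Names such as `get_sharded` and `TensorParallelRowLinear` are
illustrative assumptions, not necessarily the exact API introduced by this PR.

```python
# Sketch only: class/method names are assumptions, not the exact PR API.
import torch
import torch.distributed
from safetensors import safe_open


class Weights:
    """Map tensor names to the safetensors file holding them and load either
    the full tensor or only the slice belonging to the current shard."""

    def __init__(self, filenames, device, dtype, process_group):
        self.routing = {}
        for filename in filenames:
            with safe_open(filename, framework="pt") as f:
                for name in f.keys():
                    self.routing[name] = filename
        self.device = device
        self.dtype = dtype
        self.process_group = process_group

    def get_tensor(self, name):
        with safe_open(self.routing[name], framework="pt") as f:
            tensor = f.get_tensor(name)
        return tensor.to(dtype=self.dtype, device=self.device)

    def get_sharded(self, name, dim):
        # Each rank keeps only its block along `dim`.
        world_size = self.process_group.size()
        rank = self.process_group.rank()
        tensor = self.get_tensor(name)
        block = tensor.shape[dim] // world_size
        start, stop = rank * block, (rank + 1) * block
        return tensor[start:stop] if dim == 0 else tensor[:, start:stop]


class TensorParallelRowLinear(torch.nn.Module):
    """A "shell" layer: it does not inherit from nn.Linear, it *contains* one
    (FastLinear, bnb or GPTQ) and only adds the row-sharding all_reduce."""

    def __init__(self, linear, process_group):
        super().__init__()
        self.linear = linear
        self.process_group = process_group

    def forward(self, x):
        out = self.linear(x)
        if self.process_group is not None and self.process_group.size() > 1:
            # With a single (or dummy) process group this is effectively a no-op.
            torch.distributed.all_reduce(out, group=self.process_group)
        return out
```

The point of the shell design is that quantized and unquantized linears become
interchangeable: sharding and communication live in the wrapper, while the
contained linear only has to implement `forward`.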

![Screenshot from 2023-05-19 23-19-59](https://github.com/huggingface/text-generation-inference/assets/204321/9a802654-74a3-488c-87a8-073743a6143f)

---------

Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: OlivierDehaene <[email protected]>
Co-authored-by: OlivierDehaene <[email protected]>
5 people authored Jun 8, 2023
1 parent 19c4182 commit abd58ff
Showing 43 changed files with 6,794 additions and 2,781 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
.idea
target
router/tokenizer.json
*__pycache__*
17 changes: 9 additions & 8 deletions Dockerfile
@@ -2,6 +2,8 @@
FROM lukemathwalker/cargo-chef:latest-rust-1.69 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef as planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
@@ -98,14 +100,14 @@ COPY server/Makefile-flash-att Makefile
RUN make build-flash-attention

# Build Transformers CUDA kernels
FROM kernel-builder as transformers-builder
FROM kernel-builder as custom-kernels-builder

WORKDIR /usr/src

COPY server/Makefile-transformers Makefile
COPY server/custom_kernels/ .

# Build specific version of transformers
RUN BUILD_EXTENSIONS="True" make build-transformers
RUN python setup.py build

# Text Generation Inference base image
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
@@ -136,11 +138,10 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from transformers builder
COPY --from=transformers-builder /usr/src/transformers /usr/src/transformers
COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cpython-39/transformers /usr/src/transformers/src/transformers
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39/custom_kernels /usr/src/custom-kernels/src/custom_kernels

# Install transformers dependencies
RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
@@ -170,4 +171,4 @@ ENTRYPOINT ["./entrypoint.sh"]
FROM base

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
CMD ["--json-output"]
7 changes: 5 additions & 2 deletions Makefile
@@ -1,6 +1,9 @@
install-server:
cd server && make install

install-custom-kernels:
if [ "$$BUILD_EXTENSIONS" == "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need set to BUILD_EXTENSION environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi

install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
@@ -14,7 +17,7 @@ install-launcher:
install-benchmark:
cd benchmark && cargo install --path .

install: install-server install-router install-launcher
install: install-server install-router install-launcher install-custom-kernels

server-dev:
cd server && make run-dev
@@ -52,4 +55,4 @@ run-bloom:
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080

run-bloom-quantize:
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
14 changes: 14 additions & 0 deletions integration-tests/conftest.py
@@ -209,6 +209,7 @@ def local_launcher(
num_shard: Optional[int] = None,
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
):
port = random.randint(8000, 10_000)
master_port = random.randint(10_000, 20_000)
@@ -240,6 +241,9 @@
env = os.environ
env["LOG_LEVEL"] = "info,text_generation_router=debug"

if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"

with subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
) as process:
@@ -254,12 +258,16 @@ def local_launcher(
process.stdout.close()
process.stderr.close()

if not use_flash_attention:
del env["USE_FLASH_ATTENTION"]

@contextlib.contextmanager
def docker_launcher(
model_id: str,
num_shard: Optional[int] = None,
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
):
port = random.randint(8000, 10_000)

@@ -287,6 +295,9 @@ def docker_launcher(
gpu_count = num_shard if num_shard is not None else 1

env = {"LOG_LEVEL": "info,text_generation_router=debug"}
if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"

if HUGGING_FACE_HUB_TOKEN is not None:
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN

@@ -310,6 +321,9 @@

yield ContainerLauncherHandle(client, container.name, port)

if not use_flash_attention:
del env["USE_FLASH_ATTENTION"]

try:
container.stop()
container.wait()
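With the `use_flash_attention` flag above, an integration test can opt out of
flash attention by letting the launcher fixture export `USE_FLASH_ATTENTION=false`
into the server environment. A hypothetical fixture (the model id and fixture
names are assumptions, not taken from this diff) might look like:

```python
import pytest


@pytest.fixture(scope="module")
def neox_handle(launcher):
    # The launcher fixture forwards use_flash_attention=False to
    # local_launcher / docker_launcher, which set USE_FLASH_ATTENTION=false.
    with launcher(
        "stabilityai/stablelm-tuned-alpha-3b",
        num_shard=1,
        use_flash_attention=False,
    ) as handle:
        yield handle
```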
@@ -11,17 +11,17 @@
},
{
"id": 1459,
"logprob": -5.6289062,
"logprob": -5.6328125,
"text": " print"
},
{
"id": 81,
"logprob": -1.6005859,
"logprob": -1.6035156,
"text": "_"
},
{
"id": 7656,
"logprob": -5.9921875,
"logprob": -5.9882812,
"text": "hello"
}
],
@@ -59,19 +59,19 @@
},
{
"id": 10896,
"logprob": -0.3659668,
"logprob": -0.38549805,
"special": false,
"text": " World"
},
{
"id": 657,
"logprob": -0.49804688,
"logprob": -0.5229492,
"special": false,
"text": "\")"
},
{
"id": 203,
"logprob": -0.11279297,
"logprob": -0.10632324,
"special": false,
"text": "\n"
},
@@ -113,7 +113,7 @@
},
{
"id": 426,
"logprob": -0.051635742,
"logprob": 0.0,
"special": false,
"text": "name"
},
113 changes: 113 additions & 0 deletions integration-tests/models/__snapshots__/test_neox/test_neox.json
@@ -0,0 +1,113 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 50278,
"logprob": null,
"text": "<|USER|>"
},
{
"id": 1276,
"logprob": -4.5546875,
"text": "What"
},
{
"id": 434,
"logprob": -4.1992188,
"text": "'s"
},
{
"id": 634,
"logprob": -5.125,
"text": " your"
},
{
"id": 12315,
"logprob": -9.8984375,
"text": " mood"
},
{
"id": 3063,
"logprob": -4.0976562,
"text": " today"
},
{
"id": 32,
"logprob": -0.14562988,
"text": "?"
},
{
"id": 50279,
"logprob": -0.26733398,
"text": "<|ASSISTANT|>"
}
],
"seed": null,
"tokens": [
{
"id": 42,
"logprob": -0.86279297,
"special": false,
"text": "I"
},
{
"id": 1353,
"logprob": -0.94921875,
"special": false,
"text": "'m"
},
{
"id": 7016,
"logprob": -2.1835938,
"special": false,
"text": " sorry"
},
{
"id": 13,
"logprob": -0.074035645,
"special": false,
"text": ","
},
{
"id": 1394,
"logprob": -0.86376953,
"special": false,
"text": "You"
},
{
"id": 452,
"logprob": -1.2070312,
"special": false,
"text": " have"
},
{
"id": 247,
"logprob": -1.4365234,
"special": false,
"text": " a"
},
{
"id": 4327,
"logprob": -1.109375,
"special": false,
"text": " choice"
},
{
"id": 273,
"logprob": -0.93408203,
"special": false,
"text": " of"
},
{
"id": 752,
"logprob": -1.8808594,
"special": false,
"text": " what"
}
]
},
"generated_text": "I'm sorry,You have a choice of what"
}