Skip to content

Commit

Permalink
feat(server): Rework model loading (#344)
Browse files Browse the repository at this point in the history
# What does this PR do?

Reworked the loading logic. The idea is to use cleaner loading code:

- Remove need for `no_init_weights`
- Remove all the weird `bnb_linear`, `load_weights` and
`post_load_weights` logic.

New code layout:

- New class `Weights` in charge of loading the weights from
multiple files into the appropriate tensors (potentially sharded)
- TP layers are now "shells": they contain the code that knows what kind of
sharding is needed, plus a possible `all_reduce`. They do not inherit from
Linear; instead, they contain some kind of Linear layer
- the contained linear can be either FastLinear, BnbLinear or (coming next)
a GPTQ Linear.
- All modeling code is explicitly written for sharding; the process group is
just a no-op for non-sharded code (removes a lot of test cases)

![Screenshot from 2023-05-19
23-19-59](https://github.com/huggingface/text-generation-inference/assets/204321/9a802654-74a3-488c-87a8-073743a6143f)

---------

Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: Ubuntu <[email protected]>
Co-authored-by: OlivierDehaene <[email protected]>
Co-authored-by: OlivierDehaene <[email protected]>
  • Loading branch information
5 people authored Jun 8, 2023
1 parent 19c4182 commit abd58ff
Show file tree
Hide file tree
Showing 43 changed files with 6,794 additions and 2,781 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea
target
router/tokenizer.json
*__pycache__*
17 changes: 9 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
FROM lukemathwalker/cargo-chef:latest-rust-1.69 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef as planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
Expand Down Expand Up @@ -98,14 +100,14 @@ COPY server/Makefile-flash-att Makefile
RUN make build-flash-attention

# Build Transformers CUDA kernels
FROM kernel-builder as transformers-builder
FROM kernel-builder as custom-kernels-builder

WORKDIR /usr/src

COPY server/Makefile-transformers Makefile
COPY server/custom_kernels/ .

# Build specific version of transformers
RUN BUILD_EXTENSIONS="True" make build-transformers
RUN python setup.py build

# Text Generation Inference base image
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
Expand Down Expand Up @@ -136,11 +138,10 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from transformers builder
COPY --from=transformers-builder /usr/src/transformers /usr/src/transformers
COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cpython-39/transformers /usr/src/transformers/src/transformers
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39/custom_kernels /usr/src/custom-kernels/src/custom_kernels

# Install transformers dependencies
RUN cd /usr/src/transformers && pip install -e . --no-cache-dir && pip install einops --no-cache-dir
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
Expand Down Expand Up @@ -170,4 +171,4 @@ ENTRYPOINT ["./entrypoint.sh"]
FROM base

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
CMD ["--json-output"]
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
install-server:
cd server && make install

# Build and install the custom kernels from server/custom_kernels, but only
# when the caller opts in by exporting BUILD_EXTENSIONS=True; otherwise print
# a hint and do nothing. The comparison uses the POSIX `=` operator because
# make runs recipes with /bin/sh, where `==` inside `[ ]` is not portable.
install-custom-kernels:
	if [ "$$BUILD_EXTENSIONS" = "True" ]; then cd server/custom_kernels && python setup.py install; else echo "Custom kernels are disabled, you need to set the BUILD_EXTENSIONS environment variable to 'True' in order to build them. (Please read the docs, kernels might not work on all hardware)"; fi

install-integration-tests:
cd integration-tests && pip install -r requirements.txt
cd clients/python && pip install .
Expand All @@ -14,7 +17,7 @@ install-launcher:
install-benchmark:
cd benchmark && cargo install --path .

install: install-server install-router install-launcher
install: install-server install-router install-launcher install-custom-kernels

server-dev:
cd server && make run-dev
Expand Down Expand Up @@ -52,4 +55,4 @@ run-bloom:
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --port 8080

run-bloom-quantize:
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize --port 8080
14 changes: 14 additions & 0 deletions integration-tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ def local_launcher(
num_shard: Optional[int] = None,
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
):
port = random.randint(8000, 10_000)
master_port = random.randint(10_000, 20_000)
Expand Down Expand Up @@ -240,6 +241,9 @@ def local_launcher(
env = os.environ
env["LOG_LEVEL"] = "info,text_generation_router=debug"

if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"

with subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env
) as process:
Expand All @@ -254,12 +258,16 @@ def local_launcher(
process.stdout.close()
process.stderr.close()

if not use_flash_attention:
del env["USE_FLASH_ATTENTION"]

@contextlib.contextmanager
def docker_launcher(
model_id: str,
num_shard: Optional[int] = None,
quantize: Optional[str] = None,
trust_remote_code: bool = False,
use_flash_attention: bool = True,
):
port = random.randint(8000, 10_000)

Expand Down Expand Up @@ -287,6 +295,9 @@ def docker_launcher(
gpu_count = num_shard if num_shard is not None else 1

env = {"LOG_LEVEL": "info,text_generation_router=debug"}
if not use_flash_attention:
env["USE_FLASH_ATTENTION"] = "false"

if HUGGING_FACE_HUB_TOKEN is not None:
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN

Expand All @@ -310,6 +321,9 @@ def docker_launcher(

yield ContainerLauncherHandle(client, container.name, port)

if not use_flash_attention:
del env["USE_FLASH_ATTENTION"]

try:
container.stop()
container.wait()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@
},
{
"id": 1459,
"logprob": -5.6289062,
"logprob": -5.6328125,
"text": " print"
},
{
"id": 81,
"logprob": -1.6005859,
"logprob": -1.6035156,
"text": "_"
},
{
"id": 7656,
"logprob": -5.9921875,
"logprob": -5.9882812,
"text": "hello"
}
],
Expand Down Expand Up @@ -59,19 +59,19 @@
},
{
"id": 10896,
"logprob": -0.3659668,
"logprob": -0.38549805,
"special": false,
"text": " World"
},
{
"id": 657,
"logprob": -0.49804688,
"logprob": -0.5229492,
"special": false,
"text": "\")"
},
{
"id": 203,
"logprob": -0.11279297,
"logprob": -0.10632324,
"special": false,
"text": "\n"
},
Expand Down Expand Up @@ -113,7 +113,7 @@
},
{
"id": 426,
"logprob": -0.051635742,
"logprob": 0.0,
"special": false,
"text": "name"
},
Expand Down
113 changes: 113 additions & 0 deletions integration-tests/models/__snapshots__/test_neox/test_neox.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 50278,
"logprob": null,
"text": "<|USER|>"
},
{
"id": 1276,
"logprob": -4.5546875,
"text": "What"
},
{
"id": 434,
"logprob": -4.1992188,
"text": "'s"
},
{
"id": 634,
"logprob": -5.125,
"text": " your"
},
{
"id": 12315,
"logprob": -9.8984375,
"text": " mood"
},
{
"id": 3063,
"logprob": -4.0976562,
"text": " today"
},
{
"id": 32,
"logprob": -0.14562988,
"text": "?"
},
{
"id": 50279,
"logprob": -0.26733398,
"text": "<|ASSISTANT|>"
}
],
"seed": null,
"tokens": [
{
"id": 42,
"logprob": -0.86279297,
"special": false,
"text": "I"
},
{
"id": 1353,
"logprob": -0.94921875,
"special": false,
"text": "'m"
},
{
"id": 7016,
"logprob": -2.1835938,
"special": false,
"text": " sorry"
},
{
"id": 13,
"logprob": -0.074035645,
"special": false,
"text": ","
},
{
"id": 1394,
"logprob": -0.86376953,
"special": false,
"text": "You"
},
{
"id": 452,
"logprob": -1.2070312,
"special": false,
"text": " have"
},
{
"id": 247,
"logprob": -1.4365234,
"special": false,
"text": " a"
},
{
"id": 4327,
"logprob": -1.109375,
"special": false,
"text": " choice"
},
{
"id": 273,
"logprob": -0.93408203,
"special": false,
"text": " of"
},
{
"id": 752,
"logprob": -1.8808594,
"special": false,
"text": " what"
}
]
},
"generated_text": "I'm sorry,You have a choice of what"
}
Loading

0 comments on commit abd58ff

Please sign in to comment.