diff --git a/.github/azure-gpu-tests.yml b/.github/azure-gpu-tests.yml index 07e07b1171..abcde13069 100644 --- a/.github/azure-gpu-tests.yml +++ b/.github/azure-gpu-tests.yml @@ -50,7 +50,7 @@ jobs: python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: 'Image info & NVIDIA' - - script: pip install pytest pytest-rerunfailures -r requirements.txt transformers einops + - script: pip install pytest pytest-rerunfailures -r requirements.txt transformers einops bitsandbytes scipy tokenizers zstandard displayName: 'Install dependencies' - bash: pytest -v --durations=10 --disable-pytest-warnings --strict-markers --color=yes diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 12bf6dd9e1..2e97151557 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -38,10 +38,24 @@ jobs: requirements.txt setup.py - - name: Run tests without the package installed + - name: Install minimal dependencies run: | pip install --index-url https://download.pytorch.org/whl/nightly/cpu --pre torch>=2.1.0dev - pip install pytest pytest-rerunfailures -r requirements.txt transformers einops + pip install -r requirements.txt + pip list + # make sure all modules are importable + modules=$( + find * -type f -name "*.py" | \ + grep -v tests | grep "/" | grep -v lm_eval | \ + sed 's/\.py$//' | sed 's/\//./g' | \ + sed 's/.__init__//g' | xargs -I {} echo "import {};" + ) + echo "$modules" + python -c "$modules" + + - name: Run tests without the package installed + run: | + pip install pytest pytest-rerunfailures transformers einops bitsandbytes scipy tokenizers zstandard pip list pytest --disable-pytest-warnings --strict-markers --color=yes diff --git a/README.md b/README.md index 0b597fb69d..b6212b459d 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,16 @@ Hackable [implementation](lit_gpt/model.py) of state-of-the-art open-source larg Supports the following popular model checkpoints: -| Model and usage | Reference | -|--------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| -| Meta AI [Llama 2](tutorials/download_llama_2.md) | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | -| Stability AI [FreeWilly2](tutorials/download_freewilly_2.md) | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | -| TII UAE [Falcon](tutorials/download_falcon.md) | [TII 2023](https://falconllm.tii.ae) | -| OpenLM Research [OpenLLaMA](tutorials/download_openllama.md) | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| LMSYS [Vicuna](tutorials/download_vicuna.md) | [Li et al. 2023](https://lmsys.org/blog/2023-06-29-longchat) | -| Together [RedPajama-INCITE](tutorials/download_redpajama_incite.md) | [Together 2023](https://together.ai/blog/redpajama-models-v1) | -| EleutherAI [Pythia](tutorials/download_pythia.md) | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | -| StabilityAI [StableLM](tutorials/download_stablelm.md) | [Stability AI 2023](https://github.com/Stability-AI/StableLM) +| Model and usage | Reference | +|---------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| +| Meta AI [Llama 2](tutorials/download_llama_2.md) | [Touvron et al. 
2023](https://arxiv.org/abs/2307.09288) | +| Stability AI [FreeWilly2](tutorials/download_freewilly_2.md) | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | +| TII UAE [Falcon](tutorials/download_falcon.md) | [TII 2023](https://falconllm.tii.ae) | +| OpenLM Research [OpenLLaMA](tutorials/download_openllama.md) | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | +| LMSYS [Vicuna](tutorials/download_vicuna.md) | [Li et al. 2023](https://lmsys.org/blog/2023-06-29-longchat) | +| Together [RedPajama-INCITE](tutorials/download_redpajama_incite.md) | [Together 2023](https://together.ai/blog/redpajama-models-v1) | +| EleutherAI [Pythia](tutorials/download_pythia.md) | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| StabilityAI [StableLM](tutorials/download_stablelm.md) | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-llama) and [nanoGPT](https://github.com/karpathy/nanoGPT), and it's **powered by [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) ⚡**. @@ -109,10 +109,10 @@ pip install --index-url https://download.pytorch.org/whl/nightly/cpu --pre 'torc MAX_JOBS=4 pip install 'flash-attn>=2.0.0.post1' --no-build-isolation ``` -All good, now install the dependencies: +All good, now install the dependencies plus some optional ones: ```bash -pip install -r requirements.txt +pip install -r requirements.txt tokenizers sentencepiece ``` You are all set! 🎉 diff --git a/quantize/gptq.py b/quantize/gptq.py index 2f2c7f62c6..fbbb0ee28b 100644 --- a/quantize/gptq.py +++ b/quantize/gptq.py @@ -10,147 +10,169 @@ from typing import Optional import torch -from datasets import load_dataset from lightning import Fabric # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -import triton -import triton.language as tl +from lightning_utilities.core.imports import RequirementCache from lit_gpt import GPT, Config, Tokenizer from lit_gpt.utils import check_valid_checkpoint_dir, lazy_load - -# This is adapted from the OpenAI Triton matmul example. 
-@triton.autotune( - configs=[ - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8 - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8 - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=5, num_warps=2 - ), - triton.Config( - {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=5, num_warps=2 - ), - ], - key=["M", "N", "K"], -) -@triton.jit -def linear_kernel_4bit_weight( - # Pointers to matrices - a_ptr, - b_ptr, - c_ptr, - bscales_ptr, - bzeros_ptr, - # bdequant, - # Matrix dimensions - M, - N, - K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. stride_am is how much to increase a_ptr - # by to get the element one row down (A has M rows) - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - """Kernel for computing the matmul C = A x B.T. - A has shape (M, K), B has shape (N, K) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse - # See above `L2 Cache Optimizations` section for details - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. 
- # We will advance this pointer as we move in the K direction - # and accumulate - # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers - # see above `Pointer Arithmetics` section for details - offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - a_mask = offs_am[:, None] < M - b_mask = offs_bn[None, :] < N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + ((offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn) - - bscales_ptrs = bscales_ptr + offs_bn[None, :] - bzeros_ptrs = bzeros_ptr + offs_bn[None, :] - - scale = tl.load(bscales_ptrs) - zero = tl.load(bzeros_ptrs) - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, K, BLOCK_SIZE_K): - # wasteful as it is to load everything twice, my attempts at avoiding it lead to slower code - b12 = tl.load(b_ptrs, mask=b_mask) - # Note that for simplicity, we don't apply a mask in K here. - a = tl.load(a_ptrs, mask=a_mask).to(tl.float32) - b = (((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32) - zero) * scale - accumulator += tl.dot(a, b) - - # Advance the ptrs to the next K block - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - c = accumulator - - # ----------------------------------------------------------- - # Write back the block of the output matrix C - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) +_TRITON_AVAILABLE = RequirementCache("triton") +if _TRITON_AVAILABLE: + import triton + import triton.language as tl + + # This is adapted from the OpenAI Triton matmul example. 
+ @triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=5, + num_warps=2, + ), + ], + key=["M", "N", "K"], + ) + @triton.jit + def linear_kernel_4bit_weight( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + bscales_ptr, + bzeros_ptr, + # bdequant, + # Matrix dimensions + M, + N, + K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + ): + """Kernel for computing the matmul C = A x B.T. + A has shape (M, K), B has shape (N, K) and C has shape (M, N) + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse + # See above `L2 Cache Optimizations` section for details + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. 
+ # We will advance this pointer as we move in the K direction + # and accumulate + # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers + # see above `Pointer Arithmetics` section for details + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + a_mask = offs_am[:, None] < M + b_mask = offs_bn[None, :] < N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + ((offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn) + + bscales_ptrs = bscales_ptr + offs_bn[None, :] + bzeros_ptrs = bzeros_ptr + offs_bn[None, :] + + scale = tl.load(bscales_ptrs) + zero = tl.load(bzeros_ptrs) + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, K, BLOCK_SIZE_K): + # wasteful as it is to load everything twice, my attempts at avoiding it lead to slower code + b12 = tl.load(b_ptrs, mask=b_mask) + # Note that for simplicity, we don't apply a mask in K here. + a = tl.load(a_ptrs, mask=a_mask).to(tl.float32) + b = (((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32) - zero) * scale + accumulator += tl.dot(a, b) + + # Advance the ptrs to the next K block + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + c = accumulator + + # ----------------------------------------------------------- + # Write back the block of the output matrix C + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) def qlinear_4bit_weight(inp, weight, scales, zeros): @@ -446,6 +468,8 @@ def quantize(self): def get_sample_data(): + from datasets import load_dataset + traindata = load_dataset( "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train" ) diff --git a/requirements.txt b/requirements.txt index d7b36f3dcc..24ed80e2d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ # torch>=2.1.0dev lightning @ git+https://github.com/Lightning-AI/lightning@master -tokenizers jsonargparse[signatures] # CLI -bitsandbytes>=0.40.0 # quantize -scipy # TODO: remove when bnb has resolved https://github.com/TimDettmers/bitsandbytes/issues/544 and released the fix -datasets # quantize/gptq.py -zstandard # prepare_redpajama.py + +# other optional dependencies are +# sentencepiece # pythia, falcon, redpajama +# tokenizers # llama-based models +# bitsandbytes>=0.41.1 # quantize/bnb.py +# scipy # TODO: remove when https://github.com/TimDettmers/bitsandbytes/pull/525 is released +# datasets # quantize/gptq.py +# zstandard # scripts/prepare_redpajama.py +# git+https://github.com/EleutherAI/lm-evaluation-harness.git@master # eval diff --git a/scripts/prepare_openwebtext.py b/scripts/prepare_openwebtext.py index 0090568b97..7e709994bf 100644 --- a/scripts/prepare_openwebtext.py +++ b/scripts/prepare_openwebtext.py @@ -6,7 +6,6 @@ from 
typing import Union import numpy as np -from datasets import load_dataset # huggingface datasets from tqdm import tqdm # support running without installing as a package @@ -22,6 +21,8 @@ def prepare( seed: int = 42, test_size: Union[float, int, None] = 0.0005, ) -> None: + from datasets import load_dataset # huggingface datasets + destination_path.mkdir(parents=True, exist_ok=True) tokenizer = Tokenizer(checkpoint_dir) diff --git a/setup.py b/setup.py index 6e366d343d..e77384fd48 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ install_requires=[ # "torch>=2.1.0dev", "lightning @ git+https://github.com/Lightning-AI/lightning@master", - "tokenizers", ], packages=find_packages(), long_description=readme, diff --git a/tutorials/download_falcon.md b/tutorials/download_falcon.md index 186eb5e82a..0c05052ff0 100644 --- a/tutorials/download_falcon.md +++ b/tutorials/download_falcon.md @@ -34,6 +34,8 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falc You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/tiiuae/falcon-7b ``` diff --git a/tutorials/download_pythia.md b/tutorials/download_pythia.md index 0a17895425..5b76f266b0 100644 --- a/tutorials/download_pythia.md +++ b/tutorials/download_pythia.md @@ -45,5 +45,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/EleutherAI/ You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/EleutherAI/pythia-1b ``` diff --git a/tutorials/download_redpajama_incite.md b/tutorials/download_redpajama_incite.md index 7806aaa9eb..b7c6c9a8de 100644 --- a/tutorials/download_redpajama_incite.md +++ b/tutorials/download_redpajama_incite.md @@ -37,5 +37,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/togethercom You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 ``` diff --git a/tutorials/download_stablelm.md b/tutorials/download_stablelm.md index cb46d1730f..a9e845671e 100644 --- a/tutorials/download_stablelm.md +++ b/tutorials/download_stablelm.md @@ -32,5 +32,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/stabilityai You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b ``` diff --git a/tutorials/evaluation.md b/tutorials/evaluation.md index dbfdc434d4..7ed7773efa 100644 --- a/tutorials/evaluation.md +++ b/tutorials/evaluation.md @@ -7,13 +7,9 @@ You can evaluate Lit-GPT using [EleutherAI's lm-eval](https://github.com/Eleuthe You need to install the `lm-eval` framework first: ```bash -git clone https://github.com/EleutherAI/lm-evaluation-harness -cd lm-evaluation-harness -pip install -e . +pip install https://github.com/EleutherAI/lm-evaluation-harness/archive/refs/heads/master.zip -U ``` - - ### Evaluating Lit-GPT base models Use the following command to evaluate Lit-GPT models on all tasks in Eleuther AI's Evaluation Harness. 
@@ -28,8 +24,7 @@ python eval/lm_eval_harness.py \ To evaluate on LLMs on specific tasks, for example, TruthfulQA and HellaSwag, you can use the `--eval_task` flag as follows: - -```python +```bash python eval/lm_eval_harness.py \ --checkpoint_dir "checkpoints/Llama-2-7b-hf/" \ --eval_tasks "[truthfulqa_mc,hellaswag]" \ diff --git a/tutorials/quantize.md b/tutorials/quantize.md index a2bb9696da..f3273b88ef 100644 --- a/tutorials/quantize.md +++ b/tutorials/quantize.md @@ -46,6 +46,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check Uses the normalized float 4 (nf4) data type. This is recommended over "fp4" based on the paper's experimental results and theoretical analysis. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.nf4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 8.92 sec total, 28.69 tokens/sec @@ -60,6 +62,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model). ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.nf4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 12.06 sec total, 21.23 tokens/sec @@ -73,6 +77,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check Uses pure FP4 quantization. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.fp4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 9.20 sec total, 27.83 tokens/sec @@ -87,6 +93,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model). ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.fp4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 12.12 sec total, 21.13 tokens/sec @@ -98,6 +106,8 @@ Memory used: 5.37 GB Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check out the [paper](https://arxiv.org/abs/2110.02861) to learn more about how it works. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.int8 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 24.17 sec total, 10.59 tokens/sec @@ -111,6 +121,8 @@ Check out the [paper](https://arxiv.org/abs/2210.17323) to learn more about how This technique needs a conversion of the weights first: ```bash +pip install datasets + python quantize/gptq.py --precision bf16-true --checkpoint_dir checkpoints/tiiuae/falcon-7b ... Time for quantization: 850.25 sec total