diff --git a/.github/azure-gpu-tests.yml b/.github/azure-gpu-tests.yml index 07e07b1171..abcde13069 100644 --- a/.github/azure-gpu-tests.yml +++ b/.github/azure-gpu-tests.yml @@ -50,7 +50,7 @@ jobs: python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: 'Image info & NVIDIA' - - script: pip install pytest pytest-rerunfailures -r requirements.txt transformers einops + - script: pip install pytest pytest-rerunfailures -r requirements.txt transformers einops bitsandbytes scipy tokenizers zstandard displayName: 'Install dependencies' - bash: pytest -v --durations=10 --disable-pytest-warnings --strict-markers --color=yes diff --git a/.github/workflows/cpu-tests.yml b/.github/workflows/cpu-tests.yml index 12bf6dd9e1..2e97151557 100644 --- a/.github/workflows/cpu-tests.yml +++ b/.github/workflows/cpu-tests.yml @@ -38,10 +38,24 @@ jobs: requirements.txt setup.py - - name: Run tests without the package installed + - name: Install minimal dependencies run: | pip install --index-url https://download.pytorch.org/whl/nightly/cpu --pre torch>=2.1.0dev - pip install pytest pytest-rerunfailures -r requirements.txt transformers einops + pip install -r requirements.txt + pip list + # make sure all modules are importable + modules=$( + find * -type f -name "*.py" | \ + grep -v tests | grep "/" | grep -v lm_eval | \ + sed 's/\.py$//' | sed 's/\//./g' | \ + sed 's/.__init__//g' | xargs -I {} echo "import {};" + ) + echo "$modules" + python -c "$modules" + + - name: Run tests without the package installed + run: | + pip install pytest pytest-rerunfailures transformers einops bitsandbytes scipy tokenizers zstandard pip list pytest --disable-pytest-warnings --strict-markers --color=yes diff --git a/README.md b/README.md index 0b597fb69d..b6212b459d 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,16 @@ Hackable [implementation](lit_gpt/model.py) of state-of-the-art open-source larg Supports the following popular model checkpoints: -| Model and usage | Reference | -|--------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| -| Meta AI [Llama 2](tutorials/download_llama_2.md) | [Touvron et al. 2023](https://arxiv.org/abs/2307.09288) | -| Stability AI [FreeWilly2](tutorials/download_freewilly_2.md) | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | -| TII UAE [Falcon](tutorials/download_falcon.md) | [TII 2023](https://falconllm.tii.ae) | -| OpenLM Research [OpenLLaMA](tutorials/download_openllama.md) | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | -| LMSYS [Vicuna](tutorials/download_vicuna.md) | [Li et al. 2023](https://lmsys.org/blog/2023-06-29-longchat) | -| Together [RedPajama-INCITE](tutorials/download_redpajama_incite.md) | [Together 2023](https://together.ai/blog/redpajama-models-v1) | -| EleutherAI [Pythia](tutorials/download_pythia.md) | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | -| StabilityAI [StableLM](tutorials/download_stablelm.md) | [Stability AI 2023](https://github.com/Stability-AI/StableLM) +| Model and usage | Reference | +|---------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| +| Meta AI [Llama 2](tutorials/download_llama_2.md) | [Touvron et al. 
2023](https://arxiv.org/abs/2307.09288) | +| Stability AI [FreeWilly2](tutorials/download_freewilly_2.md) | [Stability AI 2023](https://stability.ai/blog/stable-beluga-large-instruction-fine-tuned-models) | +| TII UAE [Falcon](tutorials/download_falcon.md) | [TII 2023](https://falconllm.tii.ae) | +| OpenLM Research [OpenLLaMA](tutorials/download_openllama.md) | [Geng & Liu 2023](https://github.com/openlm-research/open_llama) | +| LMSYS [Vicuna](tutorials/download_vicuna.md) | [Li et al. 2023](https://lmsys.org/blog/2023-06-29-longchat) | +| Together [RedPajama-INCITE](tutorials/download_redpajama_incite.md) | [Together 2023](https://together.ai/blog/redpajama-models-v1) | +| EleutherAI [Pythia](tutorials/download_pythia.md) | [Biderman et al. 2023](https://arxiv.org/abs/2304.01373) | +| StabilityAI [StableLM](tutorials/download_stablelm.md) | [Stability AI 2023](https://github.com/Stability-AI/StableLM) | This implementation extends on [Lit-LLaMA](https://github.com/lightning-AI/lit-llama) and [nanoGPT](https://github.com/karpathy/nanoGPT), and it's **powered by [Lightning Fabric](https://lightning.ai/docs/fabric/stable/) ⚡**. @@ -109,10 +109,10 @@ pip install --index-url https://download.pytorch.org/whl/nightly/cpu --pre 'torc MAX_JOBS=4 pip install 'flash-attn>=2.0.0.post1' --no-build-isolation ``` -All good, now install the dependencies: +All good, now install the dependencies plus some optional ones: ```bash -pip install -r requirements.txt +pip install -r requirements.txt tokenizers sentencepiece ``` You are all set! 🎉 diff --git a/quantize/gptq.py b/quantize/gptq.py index 2f2c7f62c6..fbbb0ee28b 100644 --- a/quantize/gptq.py +++ b/quantize/gptq.py @@ -10,147 +10,169 @@ from typing import Optional import torch -from datasets import load_dataset from lightning import Fabric # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -import triton -import triton.language as tl +from lightning_utilities.core.imports import RequirementCache from lit_gpt import GPT, Config, Tokenizer from lit_gpt.utils import check_valid_checkpoint_dir, lazy_load - -# This is adapted from the OpenAI Triton matmul example. 
-@triton.autotune( - configs=[ - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8 - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8 - ), - triton.Config( - {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4 - ), - triton.Config( - {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=5, num_warps=2 - ), - triton.Config( - {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=5, num_warps=2 - ), - ], - key=["M", "N", "K"], -) -@triton.jit -def linear_kernel_4bit_weight( - # Pointers to matrices - a_ptr, - b_ptr, - c_ptr, - bscales_ptr, - bzeros_ptr, - # bdequant, - # Matrix dimensions - M, - N, - K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. stride_am is how much to increase a_ptr - # by to get the element one row down (A has M rows) - stride_am, - stride_ak, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, - BLOCK_SIZE_N: tl.constexpr, - BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - """Kernel for computing the matmul C = A x B.T. - A has shape (M, K), B has shape (N, K) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 data reuse - # See above `L2 Cache Optimizations` section for details - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. 
- # We will advance this pointer as we move in the K direction - # and accumulate - # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers - # see above `Pointer Arithmetics` section for details - offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - a_mask = offs_am[:, None] < M - b_mask = offs_bn[None, :] < N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + ((offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn) - - bscales_ptrs = bscales_ptr + offs_bn[None, :] - bzeros_ptrs = bzeros_ptr + offs_bn[None, :] - - scale = tl.load(bscales_ptrs) - zero = tl.load(bzeros_ptrs) - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher accuracy. - # `accumulator` will be converted back to fp16 after the loop - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, K, BLOCK_SIZE_K): - # wasteful as it is to load everything twice, my attempts at avoiding it lead to slower code - b12 = tl.load(b_ptrs, mask=b_mask) - # Note that for simplicity, we don't apply a mask in K here. - a = tl.load(a_ptrs, mask=a_mask).to(tl.float32) - b = (((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32) - zero) * scale - accumulator += tl.dot(a, b) - - # Advance the ptrs to the next K block - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - c = accumulator - - # ----------------------------------------------------------- - # Write back the block of the output matrix C - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) +_TRITON_AVAILABLE = RequirementCache("triton") +if _TRITON_AVAILABLE: + import triton + import triton.language as tl + + # This is adapted from the OpenAI Triton matmul example. 
+ @triton.autotune( + configs=[ + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=5, + num_warps=2, + ), + triton.Config( + {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, + num_stages=5, + num_warps=2, + ), + ], + key=["M", "N", "K"], + ) + @triton.jit + def linear_kernel_4bit_weight( + # Pointers to matrices + a_ptr, + b_ptr, + c_ptr, + bscales_ptr, + bzeros_ptr, + # bdequant, + # Matrix dimensions + M, + N, + K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, + ): + """Kernel for computing the matmul C = A x B.T. + A has shape (M, K), B has shape (N, K) and C has shape (M, N) + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 data reuse + # See above `L2 Cache Optimizations` section for details + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. 
+ # We will advance this pointer as we move in the K direction + # and accumulate + # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers + # see above `Pointer Arithmetics` section for details + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + a_mask = offs_am[:, None] < M + b_mask = offs_bn[None, :] < N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + ((offs_k[:, None] // 2) * stride_bk + offs_bn[None, :] * stride_bn) + + bscales_ptrs = bscales_ptr + offs_bn[None, :] + bzeros_ptrs = bzeros_ptr + offs_bn[None, :] + + scale = tl.load(bscales_ptrs) + zero = tl.load(bzeros_ptrs) + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, K, BLOCK_SIZE_K): + # wasteful as it is to load everything twice, my attempts at avoiding it lead to slower code + b12 = tl.load(b_ptrs, mask=b_mask) + # Note that for simplicity, we don't apply a mask in K here. + a = tl.load(a_ptrs, mask=a_mask).to(tl.float32) + b = (((b12.to(tl.uint8) >> ((offs_k[:, None] % 2) * 4)) & 0xF).to(tl.float32) - zero) * scale + accumulator += tl.dot(a, b) + + # Advance the ptrs to the next K block + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk + c = accumulator + + # ----------------------------------------------------------- + # Write back the block of the output matrix C + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) def qlinear_4bit_weight(inp, weight, scales, zeros): @@ -446,6 +468,8 @@ def quantize(self): def get_sample_data(): + from datasets import load_dataset + traindata = load_dataset( "allenai/c4", "allenai--c4", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, split="train" ) diff --git a/requirements.txt b/requirements.txt index d7b36f3dcc..24ed80e2d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ # torch>=2.1.0dev lightning @ git+https://github.com/Lightning-AI/lightning@master -tokenizers jsonargparse[signatures] # CLI -bitsandbytes>=0.40.0 # quantize -scipy # TODO: remove when bnb has resolved https://github.com/TimDettmers/bitsandbytes/issues/544 and released the fix -datasets # quantize/gptq.py -zstandard # prepare_redpajama.py + +# other optional dependencies are +# sentencepiece # pythia, falcon, redpajama +# tokenizers # llama-based models +# bitsandbytes>=0.41.1 # quantize/bnb.py +# scipy # TODO: remove when https://github.com/TimDettmers/bitsandbytes/pull/525 is released +# datasets # quantize/gptq.py +# zstandard # scripts/prepare_redpajama.py +# git+https://github.com/EleutherAI/lm-evaluation-harness.git@master # eval diff --git a/scripts/prepare_openwebtext.py b/scripts/prepare_openwebtext.py index 0090568b97..7e709994bf 100644 --- a/scripts/prepare_openwebtext.py +++ b/scripts/prepare_openwebtext.py @@ -6,7 +6,6 @@ from 
typing import Union import numpy as np -from datasets import load_dataset # huggingface datasets from tqdm import tqdm # support running without installing as a package @@ -22,6 +21,8 @@ def prepare( seed: int = 42, test_size: Union[float, int, None] = 0.0005, ) -> None: + from datasets import load_dataset # huggingface datasets + destination_path.mkdir(parents=True, exist_ok=True) tokenizer = Tokenizer(checkpoint_dir) diff --git a/setup.py b/setup.py index 6e366d343d..e77384fd48 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ install_requires=[ # "torch>=2.1.0dev", "lightning @ git+https://github.com/Lightning-AI/lightning@master", - "tokenizers", ], packages=find_packages(), long_description=readme, diff --git a/tutorials/download_falcon.md b/tutorials/download_falcon.md index 186eb5e82a..0c05052ff0 100644 --- a/tutorials/download_falcon.md +++ b/tutorials/download_falcon.md @@ -34,6 +34,8 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falc You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/tiiuae/falcon-7b ``` diff --git a/tutorials/download_pythia.md b/tutorials/download_pythia.md index 0a17895425..5b76f266b0 100644 --- a/tutorials/download_pythia.md +++ b/tutorials/download_pythia.md @@ -45,5 +45,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/EleutherAI/ You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/EleutherAI/pythia-1b ``` diff --git a/tutorials/download_redpajama_incite.md b/tutorials/download_redpajama_incite.md index 7806aaa9eb..b7c6c9a8de 100644 --- a/tutorials/download_redpajama_incite.md +++ b/tutorials/download_redpajama_incite.md @@ -37,5 +37,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/togethercom You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/togethercomputer/RedPajama-INCITE-Base-3B-v1 ``` diff --git a/tutorials/download_stablelm.md b/tutorials/download_stablelm.md index cb46d1730f..a9e845671e 100644 --- a/tutorials/download_stablelm.md +++ b/tutorials/download_stablelm.md @@ -32,5 +32,7 @@ python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/stabilityai You're done! To execute the model just run: ```bash +pip install tokenizers + python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/stabilityai/stablelm-base-alpha-3b ``` diff --git a/tutorials/evaluation.md b/tutorials/evaluation.md index dbfdc434d4..7ed7773efa 100644 --- a/tutorials/evaluation.md +++ b/tutorials/evaluation.md @@ -7,13 +7,9 @@ You can evaluate Lit-GPT using [EleutherAI's lm-eval](https://github.com/Eleuthe You need to install the `lm-eval` framework first: ```bash -git clone https://github.com/EleutherAI/lm-evaluation-harness -cd lm-evaluation-harness -pip install -e . +pip install https://github.com/EleutherAI/lm-evaluation-harness/archive/refs/heads/master.zip -U ``` - - ### Evaluating Lit-GPT base models Use the following command to evaluate Lit-GPT models on all tasks in Eleuther AI's Evaluation Harness. 
@@ -28,8 +24,7 @@ python eval/lm_eval_harness.py \ To evaluate on LLMs on specific tasks, for example, TruthfulQA and HellaSwag, you can use the `--eval_task` flag as follows: - -```python +```bash python eval/lm_eval_harness.py \ --checkpoint_dir "checkpoints/Llama-2-7b-hf/" \ --eval_tasks "[truthfulqa_mc,hellaswag]" \ diff --git a/tutorials/quantize.md b/tutorials/quantize.md index a2bb9696da..f3273b88ef 100644 --- a/tutorials/quantize.md +++ b/tutorials/quantize.md @@ -46,6 +46,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check Uses the normalized float 4 (nf4) data type. This is recommended over "fp4" based on the paper's experimental results and theoretical analysis. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.nf4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 8.92 sec total, 28.69 tokens/sec @@ -60,6 +62,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model). ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.nf4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 12.06 sec total, 21.23 tokens/sec @@ -73,6 +77,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check Uses pure FP4 quantization. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.fp4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 9.20 sec total, 27.83 tokens/sec @@ -87,6 +93,8 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model). ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.fp4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 12.12 sec total, 21.13 tokens/sec @@ -98,6 +106,8 @@ Memory used: 5.37 GB Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check out the [paper](https://arxiv.org/abs/2110.02861) to learn more about how it works. ```bash +pip install scipy bitsandbytes # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released + python generate/base.py --quantize bnb.int8 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256 ... Time for inference 1: 24.17 sec total, 10.59 tokens/sec @@ -111,6 +121,8 @@ Check out the [paper](https://arxiv.org/abs/2210.17323) to learn more about how This technique needs a conversion of the weights first: ```bash +pip install datasets + python quantize/gptq.py --precision bf16-true --checkpoint_dir checkpoints/tiiuae/falcon-7b ... Time for quantization: 850.25 sec total