Dev (#47)

* add _jit_pass_eliminate_simple_arith * disable triton reshape for vae * fix source dist * bump version to 0.0.12 and fix python publish * optimize performance
chengzeyi · Nov 23, 2023 · 83738e5 · 83738e5
1 parent cefbd1c
commit 83738e5
Show file tree

Hide file tree

Showing 14 changed files with 178 additions and 102 deletions.
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -22,7 +22,13 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-    - uses: actions/checkout@v3
+    - name: Recursive checkout
+      uses: actions/checkout@v3
+      with:
+        submodules: recursive
+        path: "."
+        # fetch-depth: 0 # for tags
+
     - name: Set up Python
       uses: actions/setup-python@v3
       with:

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,8 @@
+include MANIFEST.in
+include LICENSE
+include requirements.txt
+include version.txt
+recursive-include third_party *
+recursive-include sfast/csrc *
+prune */__pycache__
+global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp
diff --git a/examples/optimize_lcm_pipeline.py b/examples/optimize_lcm_pipeline.py
@@ -1,4 +1,5 @@
 import sys
+import time
 import torch
 from diffusers import DiffusionPipeline
 from sfast.compilers.stable_diffusion_pipeline_compiler import (
@@ -48,21 +49,24 @@ def load_model():
 
 kwarg_inputs = dict(
     prompt=
-    '(masterpiece:1,2), best quality, masterpiece, best detail face, lineart, monochrome, a beautiful girl',
+    '(masterpiece:1,2), best quality, masterpiece, best detail face, a beautiful girl',
     height=768,
     width=768,
     num_inference_steps=4,
     num_images_per_prompt=1,
 )
 
 # NOTE: Warm it up.
-# The first call will trigger compilation and might be very slow.
-# After the first call, it should be very fast.
-output_image = compiled_model(**kwarg_inputs).images[0]
+# The initial calls will trigger compilation and might be very slow.
+# After that, it should be very fast.
+for _ in range(3):
+    output_image = compiled_model(**kwarg_inputs).images[0]
 
-# Let's see the second call!
+# Let's see it!
 # Note: Progress bar might work incorrectly due to the async nature of CUDA.
+begin = time.time()
 output_image = compiled_model(**kwarg_inputs).images[0]
+print(f'Inference time: {time.time() - begin:.3f}s')
 
 # Let's view it in terminal!
 from sfast.utils.term_image import print_image

diff --git a/examples/optimize_stable_diffusion_pipeline.py b/examples/optimize_stable_diffusion_pipeline.py
@@ -1,4 +1,5 @@
 import sys
+import time
 import torch
 from diffusers import (StableDiffusionPipeline,
                        EulerAncestralDiscreteScheduler)
@@ -63,7 +64,7 @@ def load_model():
 
 kwarg_inputs = dict(
     prompt=
-    '(masterpiece:1,2), best quality, masterpiece, best detail face, lineart, monochrome, a beautiful girl',
+    '(masterpiece:1,2), best quality, masterpiece, best detail face, a beautiful girl',
     # NOTE: If you use SDXL, you should use a higher resolution to improve the generation quality.
     height=512,
     width=512,
@@ -72,15 +73,18 @@ def load_model():
 )
 
 # NOTE: Warm it up.
-# The first call will trigger compilation and might be very slow.
-# After the first call, it should be very fast.
-output_image = compiled_model(**kwarg_inputs).images[0]
+# The initial calls will trigger compilation and might be very slow.
+# After that, it should be very fast.
+for _ in range(3):
+    output_image = compiled_model(**kwarg_inputs).images[0]
 
-# Let's see the second call!
+# Let's see it!
+# Note: Progress bar might work incorrectly due to the async nature of CUDA.
+begin = time.time()
 output_image = compiled_model(**kwarg_inputs).images[0]
+print(f'Inference time: {time.time() - begin:.3f}s')
 
 # Let's view it in terminal!
-# Note: Progress bar might work incorrectly due to the async nature of CUDA.
 from sfast.utils.term_image import print_image
 
 print_image(output_image, max_width=80)
diff --git a/setup.py b/setup.py
@@ -206,6 +206,7 @@ def get_extensions():
     "Stable Fast is an ultra lightweight performance optimization framework"
     " for Hugging Fase diffuser pipelines.",
     packages=find_packages(exclude=("configs", "tests*")),
+    # include submodules in third_party
     python_requires=">=3.7",
     install_requires=fetch_requirements(),
     extras_require={

diff --git a/sfast/__init__.py b/sfast/__init__.py
@@ -23,4 +23,4 @@ def new_lru_cache(*args, **kwargs):
 
 # This line is programmatically read and rewritten by setup.py.
 # Leave it at the bottom of this file and don't touch it.
-__version__ = "0.0.11"
+__version__ = "0.0.12"
diff --git a/sfast/compilers/stable_diffusion_pipeline_compiler.py b/sfast/compilers/stable_diffusion_pipeline_compiler.py
@@ -1,7 +1,6 @@
 import logging
 import packaging.version
 from dataclasses import dataclass
-from typing import Union
 import functools
 import torch
 import sfast

diff --git a/sfast/csrc/operators/cublas/CUDABlas.cc b/sfast/csrc/operators/cublas/CUDABlas.cc
@@ -761,7 +761,7 @@ void gemm_and_bias(
   CuBlasLtMatmulPreference preference;
   // See https://github.com/pytorch/pytorch/issues/73328 for the reasoning behind
   // capping the workspace size. NOTE: the cap below is 16 MiB, not the 1M this
   // comment originally mentioned.
-  size_t workspaceSize = 1024 * 1024 * 4;
+  size_t workspaceSize = 1024 * 1024 * 16;
   TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
       preference.descriptor(),
       CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,

diff --git a/sfast/cuda/graphs.py b/sfast/cuda/graphs.py
@@ -31,6 +31,8 @@ def dynamic_graphed_callable(*args, **kwargs):
                     cached_callables[key] = cached_callable
         return cached_callable(*args, **kwargs)
 
+    dynamic_graphed_callable._cached = cached_callables
+
     return dynamic_graphed_callable
 
 

diff --git a/sfast/jit/trace_helper.py b/sfast/jit/trace_helper.py
@@ -55,7 +55,7 @@ def wrapper(*args, **kwargs):
                     traced_modules[key] = traced_module
         return traced_module(*args, **kwargs)
 
-    wrapper._traced_modules = traced_modules
+    wrapper._cached = traced_modules
 
     return wrapper