Skip to content

Commit

Permalink
Dev (#47)
Browse files Browse the repository at this point in the history
* add _jit_pass_eliminate_simple_arith

* disable triton reshape for vae

* fix source dist

* bump version to 0.0.12 and fix python publish

* optimize performance
  • Loading branch information
chengzeyi authored Nov 23, 2023
1 parent cefbd1c commit 83738e5
Show file tree
Hide file tree
Showing 14 changed files with 178 additions and 102 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,13 @@ jobs:
runs-on: ubuntu-20.04

steps:
- uses: actions/checkout@v3
- name: Recursive checkout
uses: actions/checkout@v3
with:
submodules: recursive
path: "."
# fetch-depth: 0 # for tags

- name: Set up Python
uses: actions/setup-python@v3
with:
Expand Down
8 changes: 8 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
include MANIFEST.in
include LICENSE
include requirements.txt
include version.txt
recursive-include third_party *
recursive-include sfast/csrc *
prune */__pycache__
global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp
14 changes: 9 additions & 5 deletions examples/optimize_lcm_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
import time
import torch
from diffusers import DiffusionPipeline
from sfast.compilers.stable_diffusion_pipeline_compiler import (
Expand Down Expand Up @@ -48,21 +49,24 @@ def load_model():

kwarg_inputs = dict(
prompt=
'(masterpiece:1,2), best quality, masterpiece, best detail face, lineart, monochrome, a beautiful girl',
'(masterpiece:1,2), best quality, masterpiece, best detail face, a beautiful girl',
height=768,
width=768,
num_inference_steps=4,
num_images_per_prompt=1,
)

# NOTE: Warm it up.
# The first call will trigger compilation and might be very slow.
# After the first call, it should be very fast.
output_image = compiled_model(**kwarg_inputs).images[0]
# The initial calls will trigger compilation and might be very slow.
# After that, it should be very fast.
for _ in range(3):
output_image = compiled_model(**kwarg_inputs).images[0]

# Let's see the second call!
# Let's see it!
# Note: Progress bar might work incorrectly due to the async nature of CUDA.
begin = time.time()
output_image = compiled_model(**kwarg_inputs).images[0]
print(f'Inference time: {time.time() - begin:.3f}s')

# Let's view it in terminal!
from sfast.utils.term_image import print_image
Expand Down
16 changes: 10 additions & 6 deletions examples/optimize_stable_diffusion_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
import time
import torch
from diffusers import (StableDiffusionPipeline,
EulerAncestralDiscreteScheduler)
Expand Down Expand Up @@ -63,7 +64,7 @@ def load_model():

kwarg_inputs = dict(
prompt=
'(masterpiece:1,2), best quality, masterpiece, best detail face, lineart, monochrome, a beautiful girl',
'(masterpiece:1,2), best quality, masterpiece, best detail face, a beautiful girl',
# NOTE: If you use SDXL, you should use a higher resolution to improve the generation quality.
height=512,
width=512,
Expand All @@ -72,15 +73,18 @@ def load_model():
)

# NOTE: Warm it up.
# The first call will trigger compilation and might be very slow.
# After the first call, it should be very fast.
output_image = compiled_model(**kwarg_inputs).images[0]
# The initial calls will trigger compilation and might be very slow.
# After that, it should be very fast.
for _ in range(3):
output_image = compiled_model(**kwarg_inputs).images[0]

# Let's see the second call!
# Let's see it!
# Note: Progress bar might work incorrectly due to the async nature of CUDA.
begin = time.time()
output_image = compiled_model(**kwarg_inputs).images[0]
print(f'Inference time: {time.time() - begin:.3f}s')

# Let's view it in terminal!
# Note: Progress bar might work incorrectly due to the async nature of CUDA.
from sfast.utils.term_image import print_image

print_image(output_image, max_width=80)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ def get_extensions():
"Stable Fast is an ultra lightweight performance optimization framework"
" for Hugging Fase diffuser pipelines.",
packages=find_packages(exclude=("configs", "tests*")),
# include submodules in third_party
python_requires=">=3.7",
install_requires=fetch_requirements(),
extras_require={
Expand Down
2 changes: 1 addition & 1 deletion sfast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ def new_lru_cache(*args, **kwargs):

# This line will be programmatically read and written by setup.py.
# Leave it at the bottom of this file and don't touch it.
__version__ = "0.0.11"
__version__ = "0.0.12"
1 change: 0 additions & 1 deletion sfast/compilers/stable_diffusion_pipeline_compiler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import packaging.version
from dataclasses import dataclass
from typing import Union
import functools
import torch
import sfast
Expand Down
2 changes: 1 addition & 1 deletion sfast/csrc/operators/cublas/CUDABlas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ void gemm_and_bias(
CuBlasLtMatmulPreference preference;
// See https://github.com/pytorch/pytorch/issues/73328 for the reasoning behind
// capping the workspace size; note the cap used here is larger than 1M.
size_t workspaceSize = 1024 * 1024 * 4;
size_t workspaceSize = 1024 * 1024 * 16;
TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute(
preference.descriptor(),
CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
Expand Down
2 changes: 2 additions & 0 deletions sfast/cuda/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def dynamic_graphed_callable(*args, **kwargs):
cached_callables[key] = cached_callable
return cached_callable(*args, **kwargs)

dynamic_graphed_callable._cached = cached_callables

return dynamic_graphed_callable


Expand Down
2 changes: 1 addition & 1 deletion sfast/jit/trace_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def wrapper(*args, **kwargs):
traced_modules[key] = traced_module
return traced_module(*args, **kwargs)

wrapper._traced_modules = traced_modules
wrapper._cached = traced_modules

return wrapper

Expand Down
Loading

0 comments on commit 83738e5

Please sign in to comment.