tnwei · tnwei · Dec 18, 2021 · Dec 10, 2021 · Dec 10, 2021 · Dec 10, 2021
diff --git a/app.py b/app.py
@@ -21,6 +21,16 @@
 from omegaconf import OmegaConf
 import imageio
 import numpy as np
+
+# Warn if opencv is missing: it became a required dependency in version 1.1.
+# TODO: remove this check after a few minor versions.
+try:
+    import cv2
+except ModuleNotFoundError:
+    st.warning(
+        "Version 1.1 onwards requires opencv. Please update your Python environment as defined in `environment.yml`"
+    )
+
 from logic import VQGANCLIPRun
 
 # Optional
@@ -44,6 +54,12 @@ def generate_image(
     mse_weight_decay: float = 0,
     mse_weight_decay_steps: int = 0,
     tv_loss_weight: float = 1e-3,
+    use_scrolling_zooming: bool = False,
+    translation_x: int = 0,
+    translation_y: int = 0,
+    rotation_angle: float = 0,
+    zoom_factor: float = 1,
+    transform_interval: int = 10,
     use_cutout_augmentations: bool = True,
 ) -> None:
 
@@ -62,6 +78,12 @@ def generate_image(
         mse_weight_decay=mse_weight_decay,
         mse_weight_decay_steps=mse_weight_decay_steps,
         tv_loss_weight=tv_loss_weight,
+        use_scrolling_zooming=use_scrolling_zooming,
+        translation_x=translation_x,
+        translation_y=translation_y,
+        rotation_angle=rotation_angle,
+        zoom_factor=zoom_factor,
+        transform_interval=transform_interval,
         use_cutout_augmentations=use_cutout_augmentations,
     )
 
@@ -191,6 +213,16 @@ def generate_image(
             "tv_loss_weight": tv_loss_weight,
         }
 
+        if use_scrolling_zooming:
+            details.update(
+                {
+                    "translation_x": translation_x,
+                    "translation_y": translation_y,
+                    "rotation_angle": rotation_angle,
+                    "zoom_factor": zoom_factor,
+                    "transform_interval": transform_interval,
+                }
+            )
         if use_cutout_augmentations:
             details["use_cutout_augmentations"] = True
 
@@ -262,6 +294,16 @@ def generate_image(
             "tv_loss_weight": tv_loss_weight,
         }
 
+        if use_scrolling_zooming:
+            details.update(
+                {
+                    "translation_x": translation_x,
+                    "translation_y": translation_y,
+                    "rotation_angle": rotation_angle,
+                    "zoom_factor": zoom_factor,
+                    "transform_interval": transform_interval,
+                }
+            )
         if use_cutout_augmentations:
             details["use_cutout_augmentations"] = True
 
@@ -461,6 +503,53 @@ def generate_image(
         else:
             tv_loss_weight = 0
 
+        use_scrolling_zooming = st.sidebar.checkbox(
+            "Scrolling/zooming transforms",
+            value=False,
+            help="At fixed intervals, move the generated image up/down/left/right or zoom in/out",
+        )
+        translation_x_widget = st.sidebar.empty()
+        translation_y_widget = st.sidebar.empty()
+        rotation_angle_widget = st.sidebar.empty()
+        zoom_factor_widget = st.sidebar.empty()
+        transform_interval_widget = st.sidebar.empty()
+        if use_scrolling_zooming is True:
+            translation_x = translation_x_widget.number_input(
+                "Translation in X", value=0, min_value=0, step=1
+            )
+            translation_y = translation_y_widget.number_input(
+                "Translation in y", value=0, min_value=0, step=1
+            )
+            rotation_angle = rotation_angle_widget.number_input(
+                "Rotation angle (degrees)",
+                value=0.0,
+                min_value=0.0,
+                max_value=360.0,
+                step=0.05,
+                format="%.2f",
+            )
+            zoom_factor = zoom_factor_widget.number_input(
+                "Zoom factor",
+                value=1.0,
+                min_value=0.1,
+                max_value=10.0,
+                step=0.02,
+                format="%.2f",
+            )
+            transform_interval = transform_interval_widget.number_input(
+                "Iterations per frame",
+                value=10,
+                min_value=0,
+                step=1,
+                help="Note: Will multiply by num steps above!",
+            )
+        else:
+            translation_x = 0
+            translation_y = 0
+            rotation_angle = 0
+            zoom_factor = 1
+            transform_interval = 1
+
         use_cutout_augmentations = st.sidebar.checkbox(
             "Use cutout augmentations",
             value=True,
@@ -533,7 +622,14 @@ def generate_image(
             mse_weight=mse_weight,
             mse_weight_decay=mse_weight_decay,
             mse_weight_decay_steps=mse_weight_decay_steps,
+            use_scrolling_zooming=use_scrolling_zooming,
+            translation_x=translation_x,
+            translation_y=translation_y,
+            rotation_angle=rotation_angle,
+            zoom_factor=zoom_factor,
+            transform_interval=transform_interval,
             use_cutout_augmentations=use_cutout_augmentations,
         )
+
         vid_display_slot.video("temp.mp4")
         # debug_slot.write(st.session_state) # DEBUG
diff --git a/docs/images/translationx_example_trimmed.gif b/docs/images/translationx_example_trimmed.gif
diff --git a/docs/tips-n-tricks.md b/docs/tips-n-tricks.md
@@ -31,3 +31,12 @@ A few things to take note:
 + If a new image size is specified, the existing output image will be cropped to size accordingly.
 + This is specifically possible for VQGAN-CLIP but not for CLIP guided diffusion. (Explain how both of them work)
 + Splitting a long run into multiple successive runs using the same prompt do not yield the same outcome due to the underlying stochasticity. This randomness can't be mitigated by setting the random seed alone. See the section on reproducibility in notes-and-observations.md. 
+
+## Scrolling and zooming
+
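+Scrolling and zooming transforms are adapted from [this notebook](https://colab.research.google.com/github/chigozienri/VQGAN-CLIP-animations/blob/main/VQGAN-CLIP-animations.ipynb) by @chigozienri. When enabled, the generated image is translated, rotated, or zoomed at fixed intervals before optimization continues.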
+
+![Beautiful swirling wind, trending on ArtStation](images/translationx_example_trimmed.gif)
+
+More examples at [this imgur link](https://imgur.com/a/8pyUNCQ).
+
diff --git a/environment.yml b/environment.yml
@@ -4,9 +4,8 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - pytorch::pytorch=1.8.1
-  - pytorch::torchvision=0.9.1
-  - pytorch::torchaudio=0.8.1
+  - pytorch::pytorch=1.10.0
+  - pytorch::torchvision=0.11.1
   - cudatoolkit=10.2
   - omegaconf
   - pytorch-lightning
@@ -26,6 +25,7 @@ dependencies:
   - flask
   - pip
   - gitpython
+  - opencv
   - pip:
     # - stegano
     # - python-xmp-toolkit

diff --git a/logic.py b/logic.py
@@ -18,6 +18,8 @@
 from torch.nn import functional as F
 from torch import optim
 from torchvision import transforms
+import cv2
+import numpy as np
 import kornia.augmentation as K
 
 
@@ -72,12 +74,18 @@ def __init__(
         mse_weight_decay=0.1,
         mse_weight_decay_steps=50,
         tv_loss_weight=1e-3,
-        use_cutout_augmentations: bool = True
+        use_cutout_augmentations: bool = True,
         # use_augs: bool = True,
         # noise_fac: float = 0.1,
         # use_noise: Optional[float] = None,
         # mse_withzeros=True,
         ## **kwargs,  # Use this to receive Streamlit objects ## Call from main UI
+        use_scrolling_zooming: bool = False,
+        translation_x: int = 0,
+        translation_y: int = 0,
+        rotation_angle: float = 0,
+        zoom_factor: float = 1,
+        transform_interval: int = 10,
     ) -> None:
         super().__init__()
         self.text_input = text_input
@@ -137,6 +145,13 @@ def __init__(
         # For TV loss
         self.tv_loss_weight = tv_loss_weight
 
+        self.use_scrolling_zooming = use_scrolling_zooming
+        self.translation_x = translation_x
+        self.translation_y = translation_y
+        self.rotation_angle = rotation_angle
+        self.zoom_factor = zoom_factor
+        self.transform_interval = transform_interval
+
     def load_model(
         self, prev_model: nn.Module = None, prev_perceptor: nn.Module = None
     ) -> Optional[Tuple[nn.Module, nn.Module]]:
@@ -298,26 +313,89 @@ def _ascend_txt(self) -> List:
         return result
 
     def iterate(self) -> Tuple[List[float], Image.Image]:
-        # Forward prop
-        self.opt.zero_grad()
-        losses = self._ascend_txt()
+        if not self.use_scrolling_zooming:
+            # Forward prop
+            self.opt.zero_grad()
+            losses = self._ascend_txt()
+
+            # Grab an image
+            im: Image.Image = checkin(self.model, self.z)
+
+            # Backprop
+            loss = sum([j for i, j in losses.items()])
+            loss.backward()
+            self.opt.step()
+            with torch.no_grad():
+                self.z.copy_(self.z.maximum(self.z_min).minimum(self.z_max))
+
+            # Advance iteration counter
+            self.iterate_counter += 1
 
-        # Grab an image
-        im: Image.Image = checkin(self.model, self.z)
+            print(
+                f"Step {self.iterate_counter} losses: {[(i, j.item()) for i, j in losses.items()]}"
+            )
 
-        # Backprop
-        loss = sum([j for i, j in losses.items()])
-        loss.backward()
-        self.opt.step()
-        with torch.no_grad():
-            self.z.copy_(self.z.maximum(self.z_min).minimum(self.z_max))
+            # Output stuff useful for humans
+            return [(i, j.item()) for i, j in losses.items()], im
 
-        # Advance iteration counter
-        self.iterate_counter += 1
+        else:
+            # Grab current image
+            im_before_transform: Image.Image = checkin(self.model, self.z)
 
-        print(
-            f"Step {self.iterate_counter} losses: {[(i, j.item()) for i, j in losses.items()]}"
-        )
+            # Convert for use in OpenCV
+            imarr = np.array(im_before_transform)
+            imarr = cv2.cvtColor(imarr, cv2.COLOR_RGB2BGR)
+
+            translation = np.float32(
+                [[1, 0, self.translation_x], [0, 1, self.translation_y]]
+            )
+
+            imcenter = (imarr.shape[1] // 2, imarr.shape[0] // 2)
+            rotation = cv2.getRotationMatrix2D(
+                imcenter, angle=self.rotation_angle, scale=self.zoom_factor
+            )
+
+            trans_mat = np.vstack([translation, [0, 0, 1]])
+            rot_mat = np.vstack([rotation, [0, 0, 1]])
+            transformation_matrix = np.matmul(rot_mat, trans_mat)
+
+            outarr = cv2.warpPerspective(
+                imarr,
+                transformation_matrix,
+                (imarr.shape[1], imarr.shape[0]),
+                borderMode=cv2.BORDER_WRAP,
+            )
+
+            transformed_im = Image.fromarray(cv2.cvtColor(outarr, cv2.COLOR_BGR2RGB))
+
+            # Re-encode the transformed image as the latent z and re-create the optimizer for it
+            self.z, *_ = self.model.encode(
+                TF.to_tensor(transformed_im).to(self.device).unsqueeze(0) * 2 - 1
+            )
+            self.z.requires_grad_(True)
+            self.opt = optim.Adam([self.z], lr=self.args.step_size)
+
+            for _ in range(self.transform_interval):
+                # Forward prop
+                self.opt.zero_grad()
+                losses = self._ascend_txt()
+
+                # Grab an image
+                im: Image.Image = checkin(self.model, self.z)
+
+                # Backprop
+                loss = sum([j for i, j in losses.items()])
+                loss.backward()
+                self.opt.step()
+                with torch.no_grad():
+                    self.z.copy_(self.z.maximum(self.z_min).minimum(self.z_max))
+
+            # Advance iteration counter
+            self.iterate_counter += 1
+
+            print(
+                f"Step {self.iterate_counter} losses: {[(i, j.item()) for i, j in losses.items()]}"
+            )
 
-        # Output stuff useful for humans
-        return [(i, j.item()) for i, j in losses.items()], im
+            # Output stuff useful for humans
+            return [(i, j.item()) for i, j in losses.items()], im