transform.py
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import PIL.Image
import torchvision.transforms.functional as Ft


def _convert_image_to_rgb(image):
    return image.convert("RGB")


class SquarePad:
    """
    Square pad the input image with zero padding
    """

    def __init__(self, size: int):
        """
        To keep the preprocessing pipeline consistent with CLIP, the preprocessing output dimension is taken as a
        parameter
        :param size: preprocessing output dimension
        """
        self.size = size

    def __call__(self, image):
        w, h = image.size
        max_wh = max(w, h)
        hp = int((max_wh - w) / 2)  # horizontal padding on each side
        vp = int((max_wh - h) / 2)  # vertical padding on each side
        padding = [hp, vp, hp, vp]  # [left, top, right, bottom]
        return Ft.pad(image, padding, 0, 'constant')
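
# Illustrative example (not part of the original module): a 300x200 input has
# max_wh = 300, so hp = 0 and vp = 50, giving padding = [0, 50, 0, 50] and a
# 300x300 zero-padded square before resizing.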


class TargetPad:
    """
    If the aspect ratio of the input image is above a target ratio, pad the image so that it matches that target ratio
    """

    def __init__(self, target_ratio: float, size: int):
        """
        :param target_ratio: target aspect ratio
        :param size: preprocessing output dimension
        """
        self.size = size
        self.target_ratio = target_ratio

    def __call__(self, image):
        w, h = image.size
        actual_ratio = max(w, h) / min(w, h)
        if actual_ratio < self.target_ratio:  # the image is already below the target ratio: no padding needed
            return image
        scaled_max_wh = max(w, h) / self.target_ratio  # length the shorter side must reach to match the target ratio
        hp = max(int((scaled_max_wh - w) / 2), 0)
        vp = max(int((scaled_max_wh - h) / 2), 0)
        padding = [hp, vp, hp, vp]  # [left, top, right, bottom]
        return Ft.pad(image, padding, 0, 'constant')
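
# Illustrative example (not part of the original module): with target_ratio = 1.25,
# a 640x480 image has actual_ratio = 640 / 480 ≈ 1.33 > 1.25, so it gets padded.
# scaled_max_wh = 640 / 1.25 = 512, hence hp = max(int((512 - 640) / 2), 0) = 0 and
# vp = int((512 - 480) / 2) = 16: only the short side is padded, down to the target
# ratio of 640 / 512 = 1.25.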


def squarepad_transform(dim: int):
    """
    CLIP-like preprocessing transform on a square-padded image
    :param dim: image output dimension
    :return: CLIP-like torchvision Compose transform
    """
    return Compose([
        SquarePad(dim),
        Resize(dim, interpolation=PIL.Image.BICUBIC),
        CenterCrop(dim),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # CLIP normalization stats
    ])


def targetpad_transform(target_ratio: float, dim: int):
    """
    CLIP-like preprocessing transform applied after TargetPad padding
    :param target_ratio: target ratio for TargetPad
    :param dim: image output dimension
    :return: CLIP-like torchvision Compose transform
    """
    return Compose([
        TargetPad(target_ratio, dim),
        Resize(dim, interpolation=PIL.Image.BICUBIC),
        CenterCrop(dim),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # CLIP normalization stats
    ])
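

# Minimal usage sketch (not part of the original module): the 224 output size and
# the 1.25 target ratio below are arbitrary example values, not defaults of this file.
if __name__ == "__main__":
    from PIL import Image

    # A blank 640x480 RGB image stands in for a real input.
    img = Image.new("RGB", (640, 480))

    preprocess = targetpad_transform(target_ratio=1.25, dim=224)
    tensor = preprocess(img)
    print(tensor.shape)  # expected: torch.Size([3, 224, 224])

    preprocess = squarepad_transform(dim=224)
    tensor = preprocess(img)
    print(tensor.shape)  # expected: torch.Size([3, 224, 224])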