-
Notifications
You must be signed in to change notification settings - Fork 0
/
setupLoaders.py
194 lines (152 loc) · 7.68 KB
/
setupLoaders.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
import csv
import h5py
from dxtbx.model import ExperimentList
from dials.array_family import flex
from isashumod import condition
import numpy as np
from IPython import embed
import numpy
import torch
import time
def process_img(img, cond_meth, useSqrt=False, lt=0, dev="cpu"):
    '''
    Condition a raw detector image for model input.

    Parameters
    ----------
    img : numpy array (any dtype; converted to float32 if needed)
    cond_meth : callable applied to the image tensor (a conditioning
        method instance from isashumod.condition)
    useSqrt : if True, take the square root of the conditioned image
    lt : lower threshold; conditioned pixels below this are clamped to it
    dev : torch device the tensor is moved to before conditioning

    Returns a torch tensor with a leading batch dimension of size 1.
    '''
    if img.dtype != np.float32:
        img = img.astype(np.float32)
    cond_img = torch.tensor(img).unsqueeze(0)
    # BUG FIX: Tensor.to() is not in-place; the original discarded the
    # result, so the tensor always stayed on the CPU regardless of `dev`.
    cond_img = cond_img.to(dev)
    cond_img = cond_meth(cond_img)  # .squeeze()
    # Clamp everything below the threshold up to the threshold
    cond_img[cond_img < lt] = lt
    if useSqrt:
        cond_img = torch.sqrt(cond_img)
    return cond_img
def addExpFiles(raw_dir, firstFiles, per_files, num_files, cond_meth, hd, writer):
    '''
    Record the conditioned image and spot count for each kept .expt file.

    Iterates the files in raw_dir, keeping either the first or the last
    per_files fraction (controlled by firstFiles). For each kept .expt
    file, the conditioned image is stored in the hdf5 group `hd` under
    the filename, and a [filename, num_spots] row is written to the csv
    `writer`. Spot counts come from the matching .refl file.
    '''
    i = 0
    for filename in os.listdir(raw_dir):
        i += 1
        print(filename)
        # This split may be imperfect
        # If we want the first per_files*100 percent of files, break the for loop as soon as i is past a certain index
        if firstFiles and i > per_files * num_files:
            break
        # If we want the last per_files*100 percent of files, don't load the files until i is past a certain index
        if not firstFiles and i <= (1 - per_files) * num_files + 1:
            continue
        if filename.endswith('.expt'):
            # Get the processed numpy image from the .expt file
            El = ExperimentList.from_file(os.path.join(raw_dir, filename))
            raw_img = El[0].imageset.get_raw_data(0)[0].as_numpy_array()
            # BUG FIX: the original called processImage(), which is not
            # defined anywhere in this file; the conditioning helper
            # defined above is process_img().
            cond_img = process_img(raw_img, cond_meth)
            # Get the number of spots from the corresponding .refl file
            refl_fname = filename.replace(".expt", ".refl")
            num_spot = len(flex.reflection_table.from_file(os.path.join(raw_dir, refl_fname)))
            # write a row to the csv file
            writer.writerow([filename, num_spot])
            hd.create_dataset(filename, data=cond_img)
def addExpDirs(raw_directories, raw_big_dir, per_files, cond_meth, hd, writer):
    '''Run addExpFiles over every raw dump directory under raw_big_dir.'''
    # True means "take the first per_files*100% of files" rather than the
    # last per_files*100% — this module always samples from the front.
    use_first_files = True
    for dir_name in raw_directories:
        print(dir_name)
        # Full path of this raw dump directory
        full_path = os.path.join(raw_big_dir, dir_name)
        file_count = len(os.listdir(full_path))
        addExpFiles(full_path, use_first_files, per_files, file_count,
                    cond_meth, hd, writer)
def mkFiles(pro_big_dir):
    '''
    Create the output files for one loader directory.

    Returns (hd, cs): an open h5py File "imageNameAndImage.hdf5" and an
    open csv text file "imageNameAndSpots.csv", both inside pro_big_dir.
    The caller must close both (see closeFiles). Raises if the hdf5 file
    already exists.
    '''
    # Create an hdf file
    hd_filename = os.path.join(pro_big_dir, "imageNameAndImage.hdf5")
    hd = h5py.File(hd_filename, "w-")  # The w- should cause this command to fail if the file already exists
    # Create a csv file
    cs_filename = os.path.join(pro_big_dir, "imageNameAndSpots.csv")
    # FIX: newline='' is required for files handed to csv.writer; without
    # it, \r\n platforms write an extra blank row between records.
    cs = open(cs_filename, 'w', newline='')
    return hd, cs
def closeFiles(hd, cs, cond_meth_name, raw_big_dir):
    '''
    Finalize and close the loader output files.

    Stamps the hdf5 file with the conditioning-method name and the raw
    data root folder, then closes both the hdf5 and csv handles.
    '''
    # Record provenance on the hdf5 file before closing it.
    hd.attrs["condition_method_name"] = cond_meth_name
    hd.attrs["root folder"] = raw_big_dir
    hd.close()
    cs.close()
def setupTrainLoader(raw_big_dir, raw_directories, pro_big_dir, per_files, cond_meth_name):
    '''Build one training loader: an hdf5 of images plus a csv of spot counts.'''
    # Output files for this loader
    hd, cs = mkFiles(pro_big_dir=pro_big_dir)
    spot_writer = csv.writer(cs)
    # Look up the conditioning method by name and instantiate it
    cond_meth = getattr(condition, cond_meth_name)()
    # Fill the files from every raw dump directory, then finalize them
    addExpDirs(raw_directories, raw_big_dir, per_files, cond_meth, hd, spot_writer)
    closeFiles(hd, cs, cond_meth_name, raw_big_dir)
def setupTestLoaders(raw_big_dir, raw_directories, pro_big_dir, per_files, cond_meth_name):
    '''Build one separate test loader per raw dump directory.'''
    cond_meth = getattr(condition, cond_meth_name)()
    for dir_name in raw_directories:
        # Source directory for this test set
        src_dir = os.path.join(raw_big_dir, dir_name)
        # Destination directory, named from the first 5 chars of the dump
        # (e.g. "1.25ADump" -> "1.25ALoader")
        dst_dir = os.path.join(pro_big_dir, dir_name[:5] + 'Loader')
        os.mkdir(dst_dir)
        # Make hdf5 and csv files for this loader
        hd, cs = mkFiles(pro_big_dir=dst_dir)
        spot_writer = csv.writer(cs)
        total = len(os.listdir(src_dir))
        # True: sample the first per_files*100% of files, not the last
        take_first = True
        addExpFiles(src_dir, take_first, per_files, total, cond_meth, hd, spot_writer)
        closeFiles(hd, cs, cond_meth_name, raw_big_dir=src_dir)
def main():
    '''
    Build three loader sets (smaller/small/big), each containing a train,
    test, and validation loader, under a fresh output directory.

    The smaller/small/big sets use 1%, 10%, and 100% of the files in
    each raw dump directory, respectively. Validation loaders are built
    like train loaders but from the test dump directories.
    '''
    new_loaders = 'threeMax'
    all_loaders = '/mnt/tmpdata/data/isashu/garbage'
    big_dir = os.path.join(all_loaders, new_loaders)
    os.mkdir(big_dir)
    loaderSizes = ['smaller', 'small', 'big']
    loaderSizeVals = [0.01, 0.1, 1]
    raw_big_dir = '/mnt/tmpdata/data/isashu/exptFileDumps'
    raw_train_directories = ['1.42ADump', '1.450ADump', '1.45ADump', '1.50ADump', '1.60ADump', '1.62ADump',
                             '1.66ADump', '1.70ADump', '1.72ADump', '1.74ADump', '1.76ADump', '1.81ADump',
                             '1.85ADump', '1.8ADump', '1.95ADump', '2.0ADump', '2.5ADump', '2.80ADump', '2.85ADump',
                             '2.90ADump', '5.40ADump']
    raw_test_directories = ['1.25ADump', '3.15ADump', '5.45ADump']
    cond_meth_name = "mx_gamma"
    # FIX: iterate the sizes and their file fractions together with zip
    # instead of maintaining a manual index counter alongside the loop.
    for size, per_files in zip(loaderSizes, loaderSizeVals):
        loaders = os.path.join(big_dir, size + 'Loaders')
        os.mkdir(loaders)
        # Training loader from the training dumps
        train_loader = os.path.join(loaders, 'trainLoader')
        os.mkdir(train_loader)
        # Time the preprocessing pass before building the real loader
        time_prepro(raw_big_dir=raw_big_dir, raw_directories=raw_train_directories,
                    per_files=per_files, cond_meth_name=cond_meth_name)
        setupTrainLoader(raw_big_dir=raw_big_dir, raw_directories=raw_train_directories, pro_big_dir=train_loader,
                         per_files=per_files, cond_meth_name=cond_meth_name)
        # One test loader per test dump directory
        test_loader = os.path.join(loaders, 'testLoaders')
        os.mkdir(test_loader)
        setupTestLoaders(raw_big_dir, raw_test_directories, pro_big_dir=test_loader, per_files=per_files,
                         cond_meth_name=cond_meth_name)
        # Validation loader: built like a train loader, but from the test dumps
        val_loader = os.path.join(loaders, 'valLoader')
        os.mkdir(val_loader)
        setupTrainLoader(raw_big_dir=raw_big_dir, raw_directories=raw_test_directories, pro_big_dir=val_loader,
                         per_files=per_files, cond_meth_name=cond_meth_name)
def time_prepro(raw_big_dir, raw_directories, per_files, cond_meth_name):
    '''
    Time the image-conditioning step over the same file selection the
    loader build would use, without writing any output.

    Prints the per-image conditioning time and the total elapsed time.
    '''
    ini = time.time()
    cond_meth = getattr(condition, cond_meth_name)()
    for di in raw_directories:  # for direct in raw_directiories
        print(di)
        # generate the path...
        raw_dir = os.path.join(raw_big_dir, di)  # 1.for the raw directory
        num_files = len(os.listdir(raw_dir))
        firstFiles = True  # True if you are making the dataset from the first several files as opposed to the last several files
        i = 0
        for filename in os.listdir(raw_dir):
            i += 1
            print(filename)
            # This split may be imperfect
            # If we want the first per_files*100 percent of files, break the for loop as soon as i is past a certain index
            if firstFiles and i > per_files * num_files:
                break
            # If we want the last per_files*100 percent of files, don't load the files until i is past a certain index
            if not firstFiles and i <= (1 - per_files) * num_files + 1:
                continue
            if filename.endswith('.expt'):
                # 4.Extract the numpy image from the experiment file
                El = ExperimentList.from_file(os.path.join(raw_dir, filename))
                raw_img = El[0].imageset.get_raw_data(0)[0].as_numpy_array()
                bp = time.time()
                # BUG FIX: the original called processImage(), which does
                # not exist; the conditioning helper is process_img().
                cond_img = process_img(raw_img, cond_meth)
                print('Time to process is %.6f' % (time.time() - bp))
    print(f'Time is: {time.time() - ini}')
# Script entry point: build all loader sets when run directly.
if __name__ == "__main__":
    main()