feature_extractor.py
"""Extract image features from the next-to-last layer of a pre-trained
Inception network (TensorFlow 1.x, classify_image graph).

Credit: KERNIX blog - "Image classification with a pre-trained deep
neural network".
"""
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.platform import gfile
def create_graph():
    """Load the pre-trained graph from imagenet/classify_image_graph_def.pb
    and import it into the default TensorFlow graph."""
    model_dir = 'imagenet'
    with gfile.FastGFile(os.path.join(
            model_dir, 'classify_image_graph_def.pb'), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')
def extract_features(in_item, save_loc=None):
    """Take a list of image file paths, or a pandas DataFrame with a
    file_path column, and return a numpy array of shape
    (n_images, 2048) holding one pool_3 feature vector per image.
    Optionally save the array as a .npy file to save_loc."""
    # TODO: improve saving
    #   - check whether features.npy already exists
    #   - if it exists (and covers the same photo set), pick up from the end
    #   - consider putting the features into MongoDB
    if isinstance(in_item, list):
        list_images = in_item
    elif isinstance(in_item, pd.DataFrame):
        list_images = in_item.file_path
    else:
        raise TypeError('in_item must be a list of file paths or a '
                        'DataFrame with a file_path column')
    nb_features = 2048
    features = np.empty((len(list_images), nb_features))
    create_graph()
    with tf.Session() as sess:
        # pool_3:0 is the 2048-d next-to-last layer of the Inception graph.
        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')
        for ind, image in enumerate(list_images):
            if ind % 100 == 0:
                print('Processing %s...' % image)
                # Save partial results periodically.
                if save_loc:
                    np.save(save_loc, features)
            if not gfile.Exists(image):
                tf.logging.fatal('File does not exist %s', image)
            image_data = gfile.FastGFile(image, 'rb').read()
            predictions = sess.run(next_to_last_tensor,
                                   {'DecodeJpeg/contents:0': image_data})
            features[ind, :] = np.squeeze(predictions)
    if save_loc is not None:
        np.save(save_loc, features)
    return features
# Create a DataFrame with TensorFlow features, keywords, and file paths
# for use in data_pipline.load_df().
def feature_df(df, arr):
    """Take metadata (a csv path or DataFrame) and features (a .npy path
    or numpy array); return a DataFrame with one column per feature plus
    keywords and file_path columns."""
    if isinstance(arr, str):
        ftrs = np.load(arr)
    else:
        ftrs = arr
    if isinstance(df, str):
        metadata_df = pd.read_csv(df)
    else:
        metadata_df = df
    out_df = pd.DataFrame(ftrs)
    out_df['keywords'] = metadata_df.keywords
    out_df['file_path'] = metadata_df.file_path
    return out_df
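

# A minimal usage sketch (an assumption, not part of the original script):
# 'photos.csv' and 'features.npy' are placeholder paths, and the CSV is
# assumed to have file_path and keywords columns.
if __name__ == '__main__':
    metadata = pd.read_csv('photos.csv')
    feats = extract_features(metadata, save_loc='features.npy')
    combined = feature_df(metadata, feats)
    # Equivalently, rebuild from the saved artifacts:
    # combined = feature_df('photos.csv', 'features.npy')
    print(combined.shape)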