-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathwelcome.py
executable file
·429 lines (378 loc) · 24.5 KB
/
welcome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import requests
import datetime
# _config.py is optional (local development only); when it is absent the
# credentials come from VCAP_SERVICES or the empty-string fallbacks instead.
try:
    import _config
except ImportError:
    # Only a missing module is tolerated; an error raised from inside an
    # existing _config.py surfaces immediately instead of being ignored.
    pass
from flask import Flask, render_template, request
from watson_developer_cloud import NaturalLanguageClassifierV1
from flask_table import Table, Col
from lxml import html
app = Flask(__name__)
# Required classifiers we will need and the data to train them, as
# [classifier name, training data file name] pairs
REQ_CLASSIFIERS = [
    ['Product_description_Top_Level', 'product_descriptions_top_levels_tuncated.csv'],
    ['Product_description_Gender', 'product_descriptions_gender.csv'],
    ['Product_description_Health', 'product_descriptions_health.csv'],
    ['Product_description_Electronics', 'product_descriptions_electronics.csv'],
    ['Product_description_Home', 'product_descriptions_home.csv'],
    ['Product_description_Clothing', 'product_descriptions_clothing.csv'],
    ['Product_description_Apparel', 'product_descriptions_apparel.csv'],
]

#
# Name of the project root folder (the one holding the ``data`` folder).
# If you CHANGED the folder name, CHANGE the name here to match; when it is
# not found the folder containing this file is used instead.
#
_PROJECT_FOLDER = 'NLC_product_classifier-demo'

VCAP_SERVICES = os.getenv("VCAP_SERVICES")
if VCAP_SERVICES is not None:
    # These will be automatically set if deployed to IBM Cloud
    SERVICES = json.loads(VCAP_SERVICES)
    NLC_USERNAME = SERVICES['natural_language_classifier'][0]['credentials']['username']
    NLC_PASSWORD = SERVICES['natural_language_classifier'][0]['credentials']['password']
    # OPTIONAL APP NOTIFICATIONS FROM IBM ALERT NOTIFICATION
    # The service is optional, so an unbound service must not abort start-up:
    # fall back to empty credentials, which switches alerting off.
    _alert_credentials = (SERVICES.get('alertnotification') or [{}])[0].get('credentials', {})
    ALERT_USERNAME = _alert_credentials.get('name', '')
    ALERT_PASSWORD = _alert_credentials.get('password', '')
    # set path when deployed to Bluemix so the same references to other folders can be made as when local
    cur_path = '/home/vcap/app'
else:
    # start with current path and walk upwards until the project folder is found
    cur_path = os.path.abspath(__file__)
    _start_folder = os.path.dirname(cur_path)
    while os.path.basename(cur_path) != _PROJECT_FOLDER:
        _parent = os.path.dirname(cur_path)
        if _parent == cur_path:
            # Reached the filesystem root without a match: stop instead of
            # looping forever and use the folder this file lives in.
            cur_path = _start_folder
            break
        cur_path = _parent
    # Set these here for local development.  _config is only defined when the
    # optional _config.py could be imported.
    try:
        _local_config = _config
    except NameError:
        _local_config = None
    # Each setting falls back on its own, so a _config.py without the optional
    # alert settings still supplies the NLC credentials.  Replace the
    # empty-string defaults to hardcode credentials.
    NLC_USERNAME = getattr(_local_config, 'NLC_USERNAME', "")
    NLC_PASSWORD = getattr(_local_config, 'NLC_PASSWORD', "")
    # OPTIONAL APP NOTIFICATIONS FROM IBM ALERT NOTIFICATION
    ALERT_USERNAME = getattr(_local_config, 'alert_user', '')
    ALERT_PASSWORD = getattr(_local_config, 'alert_password', '')

# location to data is now the same for both local and Bluemix deployment
data_folder = os.path.join(cur_path, 'data')

# initialize variables; the routes fill these in once the service is contacted
CLASSIFIER_READY = None
CLASSIFIER_STATUS = None
ALL_CLASSIFIERS = None
NLC_SERVICE = None
@app.route('/')
def Welcome():
    """Render the landing page and start classifier set-up.

    Creates the NLC client from the module-level credentials, calls
    ``_init_classifiers()`` and stores the outcome in the module-level
    ``ALL_CLASSIFIERS``, ``CLASSIFIER_STATUS`` and ``CLASSIFIER_READY`` so
    the classify routes can reuse it.

    Returns:
        The rendered ``index.html``.  Depending on the outcome it carries the
        classifier table only (all classifiers available), the table plus a
        status line (training / unavailable), or an error line only (missing
        credentials or an unexpected failure before the table was built).
    """
    # call global variables
    global CLASSIFIER_READY
    global CLASSIFIER_STATUS
    global ALL_CLASSIFIERS
    global NLC_SERVICE

    scroll_script = '<script src="static/scripts/bottom_scroll.js" language="javascript" type="text/javascript"></script>'

    NLC_SERVICE = False
    # Empty credentials can never authenticate, so skip straight to the
    # provisioning message instead of failing later with a generic error.
    if NLC_USERNAME and NLC_PASSWORD:
        try:
            # initiate NLC service
            NLC_SERVICE = NaturalLanguageClassifierV1(
                username=NLC_USERNAME,
                password=NLC_PASSWORD
            )
        except Exception:
            # catch authentication failures and raise warning message
            NLC_SERVICE = False

    if not NLC_SERVICE:
        # Return only a message that the NLC access has not been provisioned. Adding credentials to a _config.py file,
        # hardcoding them in to this script or launching this app through IBM Bluemix will all automate the training process
        return render_template('index.html', error_line="Please add a _config.py file with your NLC credentials if running locally. ", scroll_script=scroll_script)

    # Stays None until the table exists, so the error path below never reads
    # an unassigned local.
    classifier_info = None
    try:
        # If the credentials are authenticated, begin training any instances not initiated and store the UUID/status of each.
        # _init_classifiers() already derives the status string and ready flag from the individual statuses.
        ALL_CLASSIFIERS, CLASSIFIER_STATUS, CLASSIFIER_READY = _init_classifiers()
        # retrieve classifier information
        _CLASSIFIER = [{'_name': _name, '_id': data['id'], '_status': data['status']} for _name, data in ALL_CLASSIFIERS.items()]
        classifier_info = ConfigTable(_CLASSIFIER)
        # update the UI, but only the classifier info box
        if CLASSIFIER_READY:
            return render_template('index.html', classifier_info=classifier_info)
        if CLASSIFIER_STATUS == 'training':
            # Return status on any classifier instances which are still training
            return render_template('index.html', classifier_info=classifier_info, error_line='Classifier is currently %s.' % (CLASSIFIER_STATUS), training_icon='<img id="training_icon" src="static/images/ibm-watson.gif" alt=" " class="center"/>', scroll_script=scroll_script)
        # Return status on any classifier instances which are experiencing issues
        return render_template('index.html', classifier_info=classifier_info, error_line='Classifier is currently %s.' % (CLASSIFIER_STATUS))
    except Exception as details:
        # send service failure alert, naming this route like the other routes do
        _error_alerts(details, 'Welcome', 'Fatal')
        # return error page, with the classifier table only if it was built
        if classifier_info is None:
            return render_template('index.html', error_line='Unexpected error encountered')
        return render_template('index.html', classifier_info=classifier_info, error_line='Unexpected error encountered')
@app.route('/classify_text', methods=['GET', 'POST'])
def classify_text():
    """Classify a product description typed into the form.

    Reads the ``classifierinput_text`` form field, rebuilds the classifier
    status table (re-running ``_init_classifiers()`` once if the cached
    classifier data cannot be read) and, when the classifiers are ready,
    renders the classification results.

    Returns:
        The rendered ``index.html`` with results, a status line, or an error
        line.
    """
    # call global variables
    global CLASSIFIER_READY
    global CLASSIFIER_STATUS
    global ALL_CLASSIFIERS
    global NLC_SERVICE

    # stdlib escaper, imported locally so the block brings its own dependency
    from xml.sax.saxutils import escape

    scroll_script = '<script src="static/scripts/bottom_scroll.js" language="javascript" type="text/javascript"></script>'

    # get the text from the UI; a GET request (also routed here) has no form
    # data, so default to an empty description instead of failing with a 400
    input_text = request.form.get('classifierinput_text', '')

    # get info on our classifiers and format their statuses to HTML table
    try:
        # retrieve classifier information
        _CLASSIFIER = [{'_name': _name, '_id': data['id'], '_status': data['status']} for _name, data in ALL_CLASSIFIERS.items()]
        classifier_info = ConfigTable(_CLASSIFIER)
    # catch error in retrieving classifier information
    except Exception as details:
        # send service warning alert
        _error_alerts(details, 'classify_text', 'Warning')
        try:
            # rerun retrieval of classifier information
            ALL_CLASSIFIERS, CLASSIFIER_STATUS, CLASSIFIER_READY = _init_classifiers()
            _CLASSIFIER = [{'_name': _name, '_id': data['id'], '_status': data['status']} for _name, data in ALL_CLASSIFIERS.items()]
            classifier_info = ConfigTable(_CLASSIFIER)
        # catch continued error in retrieving classifier information
        except Exception as details_2:
            # send service failure alert
            _error_alerts(details_2, 'classify_text', 'Fatal')
            # return error page; no classifier table exists at this point, so
            # none is passed to the template
            return render_template('index.html', error_line='Unexpected error encountered retrieving NLC instances, please try reloading the home page.', scroll_script=scroll_script)

    # The text is user supplied and is embedded in HTML handed to the
    # template, so neutralise markup characters first.
    # NOTE(review): assumes index.html renders classifier_input unescaped
    # (it is also handed raw <textarea> markup) - confirm against the template.
    safe_text = escape(input_text)

    if not CLASSIFIER_READY:
        # return only classifier information in event of failure
        return render_template('index.html', classifier_info=classifier_info, classifier_input=safe_text, error_line='Classifier is currently %s.' % (CLASSIFIER_STATUS), scroll_script=scroll_script)

    # check for valid product description
    try:
        if input_text != '':
            # retrieve classification from text
            full_output = _classify(input_text)
            # the main objective, a catalogue hierarchy, only using the top choices
            concat_output = '-'.join([_capitalize(i['class_1']) for i in full_output])
            # send results to table formatter
            all_results = ResultsTable(full_output)
            # fill in the text boxes internally so they don't appear as empty when not used
            return render_template('index.html', classifier_info=classifier_info, classifier_input='<textarea rows="5" cols="126"> %s </textarea>' % (safe_text), all_results=all_results, error_line=concat_output, scroll_script=scroll_script)
        # nothing to classify: return the reminder shown for unusable input
        return render_template('index.html', classifier_info=classifier_info, error_line='Invalid Url. Please provide a product page from Kohls.com, or manually add the product description above.', scroll_script=scroll_script)
    except Exception as details:
        # send service failure alert
        _error_alerts(details, 'classify_text', 'Fatal')
        # return error page
        return render_template('index.html', classifier_info=classifier_info, error_line='Unexpected error encountered', scroll_script=scroll_script)
@app.route('/classify_url', methods=['GET', 'POST'])
def classify_url():
    """Classify the product description scraped from a Kohls.com product URL.

    Rebuilds the classifier status table (re-running ``_init_classifiers()``
    once if the cached classifier data cannot be read), reads the
    ``classifierinput_url`` form field, extracts the description with
    ``_get_Kohls_url_info()`` and, when the classifiers are ready, renders
    the classification results.

    Returns:
        The rendered ``index.html`` with results, a status line, or an error
        line.
    """
    global CLASSIFIER_READY
    global CLASSIFIER_STATUS
    global ALL_CLASSIFIERS
    global NLC_SERVICE

    # stdlib escaper, imported locally so the block brings its own dependency
    from xml.sax.saxutils import escape

    scroll_script = '<script src="static/scripts/bottom_scroll.js" language="javascript" type="text/javascript"></script>'
    invalid_url_line = 'Invalid Url. Please provide a product page from Kohls.com, or manually add the product description above.'

    # get info on our classifiers and format their statuses to HTML table
    try:
        _CLASSIFIER = [{'_name': _name, '_id': data['id'], '_status': data['status']} for _name, data in ALL_CLASSIFIERS.items()]
        classifier_info = ConfigTable(_CLASSIFIER)
    except Exception as details:
        # send service warning alert
        _error_alerts(details, 'classify_url', 'Warning')
        try:
            # rerun retrieval of classifier information
            ALL_CLASSIFIERS, CLASSIFIER_STATUS, CLASSIFIER_READY = _init_classifiers()
            _CLASSIFIER = [{'_name': _name, '_id': data['id'], '_status': data['status']} for _name, data in ALL_CLASSIFIERS.items()]
            classifier_info = ConfigTable(_CLASSIFIER)
        # catch continued error in retrieving classifier information
        except Exception as details_2:
            # send service failure alert
            _error_alerts(details_2, 'classify_url', 'Fatal')
            # return error page; no classifier table exists at this point, so
            # none is passed to the template
            return render_template('index.html', error_line='Unexpected error encountered retrieving NLC instances, please try reloading the home page.', scroll_script=scroll_script)

    # get the url from the UI; a GET request (also routed here) has no form
    # data, so default to an empty url instead of failing with a 400
    input_url = request.form.get('classifierinput_url', '')

    # send url to parser
    try:
        input_text = _get_Kohls_url_info(input_url)
    except Exception as details:
        # send service failure alert
        _error_alerts(details, 'get_url_text', 'Fatal')
        # return error page
        return render_template('index.html', classifier_info=classifier_info, error_line=invalid_url_line, scroll_script=scroll_script)

    # The description comes from an external page and is embedded in HTML
    # handed to the template, so neutralise markup characters first.  The
    # parser returns a falsy value for unusable urls; show that as empty text.
    # NOTE(review): assumes index.html renders classifier_input unescaped
    # (it is also handed raw <textarea> markup) - confirm against the template.
    safe_text = escape(input_text) if input_text else ''

    if not CLASSIFIER_READY:
        # return only classifier information in event of failure
        return render_template('index.html', classifier_info=classifier_info, classifier_input=safe_text, error_line='Classifier is currently %s.' % (CLASSIFIER_STATUS), scroll_script=scroll_script)

    # check for valid product description
    try:
        if input_text:
            # retrieve classification from text
            full_output = _classify(input_text)
            # the main objective, a catalogue hierarchy, only using the top choices
            concat_output = '-'.join([_capitalize(i['class_1']) for i in full_output])
            # send results to table formatter
            all_results = ResultsTable(full_output)
            # fill in the text boxes internally so they don't appear as empty when not used
            return render_template('index.html', classifier_info=classifier_info, classifier_input='<textarea rows="5" cols="126"> %s </textarea>' % (safe_text), all_results=all_results, error_line=concat_output, scroll_script=scroll_script)
        # return a reminder that this can only handle product pages from Kohl's if an invalid url is passed
        return render_template('index.html', classifier_info=classifier_info, error_line=invalid_url_line, scroll_script=scroll_script)
    except Exception as details:
        # send service failure alert
        _error_alerts(details, 'classify_url', 'Fatal')
        # return error page
        return render_template('index.html', classifier_info=classifier_info, error_line='Unexpected error encountered', scroll_script=scroll_script)
class ResultsTable(Table):
    """Table of classification results.

    Each row is one of the dictionaries produced by ``_classify``; the
    attribute names below are the keys read from it and the ``Col`` argument
    is the column heading.
    """
    # identifier attached to this table
    table_id = "classes"
    # best and second-best class, each followed by its confidence
    class_1 = Col("#1 Choice")
    confidence_1 = Col("#1 Confidence")
    class_2 = Col("#2 Choice")
    confidence_2 = Col("#2 Confidence")
class ConfigTable(Table):
    """Table listing every required classifier with its id and status.

    Rows are dictionaries with ``_name``, ``_id`` and ``_status`` keys, as
    built by the routes from ``ALL_CLASSIFIERS``.
    """
    # identifier attached to this table
    table_id = "config"
    # one column per key of the row dictionaries
    _name = Col("Name")
    _id = Col("ID")
    _status = Col("Status")
def _init_classifiers():
    """Create or look up the required classifiers and summarise their state.

    Returns:
        A ``(classifiers, status, ready)`` tuple: the dictionary returned by
        ``_create_classifier()``, an overall status string (``'available'``,
        ``'training'`` or ``'unavailable'``) and a boolean that is True only
        when the status is ``'available'``.
    """
    classifiers = _create_classifier()
    statuses = [entry['status'] for entry in classifiers.values()]
    # any of these on a single classifier keeps the whole set from being usable
    blocking = ['Non Existent', 'Training', 'Failed', 'Unavailable']

    # The overall status is used both for in-app messages and can be handed to
    # flask_table as an id to trigger HTML formatting.
    if not any(status in blocking for status in statuses):
        return classifiers, 'available', True
    if 'Training' in statuses:
        return classifiers, 'training', False
    return classifiers, 'unavailable', False
def _create_classifier():
    """Make sure every required classifier exists and collect id and status.

    Lists the classifiers of the NLC instance once.  Each entry of
    ``REQ_CLASSIFIERS`` that is missing is created from its training file in
    ``data_folder``; for each existing one the current status is fetched.

    Returns:
        dict: classifier name -> ``{'id': ..., 'status': ...}``.
    """
    # fetch all classifiers associated with the NLC instance
    result = NLC_SERVICE.list_classifiers()

    # Build the name -> id lookup once instead of rescanning the listing for
    # every required classifier.  setdefault keeps the first id when a name
    # occurs more than once.
    existing_ids = {}
    for entry in result['classifiers']:
        existing_ids.setdefault(entry['name'], entry['classifier_id'])

    ALL_CLASSIFIERS = {}
    for name, DATA_SET in REQ_CLASSIFIERS:
        if name in existing_ids:
            # already initiated: only its current status has to be fetched
            classifier_id = existing_ids[name]
            status = NLC_SERVICE.get_classifier(classifier_id)['status']
        else:
            # needs training but has not been initiated yet
            with open(os.path.join(data_folder, DATA_SET), 'rb') as training_data:
                classifier = NLC_SERVICE.create_classifier(
                    metadata=json.dumps({"name": name, "language": "en"}),
                    training_data=training_data
                )
            classifier_id = classifier['classifier_id']
            status = classifier['status']
        # store classifier information for future handling between the different instances
        ALL_CLASSIFIERS[name] = {'id': classifier_id, 'status': status}
    return ALL_CLASSIFIERS
def _classify(input_text):
    """Classify a product description through the classifier hierarchy.

    The text first goes to the top-level classifier; its best class decides
    which domain classifier(s) the text is passed to next.

    Args:
        input_text: the product description to classify.

    Returns:
        list of dict: one entry per classifier consulted, top level first,
        each with ``class_1``, ``confidence_1``, ``class_2`` and
        ``confidence_2``.  When the top-level class has no domain classifier
        the list holds the top-level entry only.
    """
    def _top_two(classifier_name):
        # top two choices of one classifier, along with the corresponding confidence
        classes = NLC_SERVICE.classify(ALL_CLASSIFIERS[classifier_name]['id'], input_text)['classes'][:2]
        return {
            'class_1': classes[0]['class_name'],
            'confidence_1': classes[0]['confidence'],
            'class_2': classes[1]['class_name'],
            'confidence_2': classes[1]['confidence'],
        }

    # high level classification which determines which other classifiers the text is passed to
    top_level = _top_two('Product_description_Top_Level')
    top_class = top_level['class_1']

    if top_class == 'Apparel-Clothing':
        # clothing: first determine target gender, then the product specifics
        follow_ups = ['Product_description_Gender', 'Product_description_Clothing']
    elif top_class == 'Apparel-Accessories':
        # fashion accessories, which are tougher to determine target gender
        follow_ups = ['Product_description_Apparel']
    elif top_class.split('-')[0] == 'Electronics':
        # electronic and automotive products
        follow_ups = ['Product_description_Electronics']
    elif top_class.split('_')[0] == 'Health':
        # health, beauty and fitness products (this prefix is split on '_')
        follow_ups = ['Product_description_Health']
    elif top_class.split('-')[0] == 'Home':
        # home goods
        follow_ups = ['Product_description_Home']
    else:
        # No domain classifier for this class: report the top level alone
        # rather than failing on results that were never computed.
        follow_ups = []

    return [top_level] + [_top_two(classifier_name) for classifier_name in follow_ups]
def _capitalize(word):
# formatting for a mistake in assembling the training data
full_word = []
for peice in word.split('-'):
if len(peice) == 1:
full_word.append(peice.upper())
else:
full_word.append(peice[0].upper()+peice[1:].lower())
return '-'.join(full_word)
def _error_alerts(details, where, severity):
    """Post a failure alert to IBM Alert Notification, if it is configured.

    Does nothing when either ``ALERT_USERNAME`` or ``ALERT_PASSWORD`` is empty.

    Args:
        details: what went wrong (normally the caught exception).
        where: name of the place the failure was seen.
        severity: severity label, e.g. ``'Warning'`` or ``'Fatal'``.
    """
    # send failure alert only if the service is in use
    if ALERT_USERNAME == '' or ALERT_PASSWORD == '':
        return

    # json.dumps keeps the payload valid even when the text contains quotes
    # or backslashes.
    message = json.dumps({
        "What": str(details),
        "Where": str(where),
        "Severity": str(severity),
        "When": str(datetime.datetime.now()),
    })
    # Authenticate with the resolved module-level credentials: _config is not
    # available when deployed, while ALERT_* are set in both environments.
    # The timeout keeps an unresponsive alert endpoint from stalling the
    # request that is reporting the error.
    requests.post(
        "https://ans-us-south.opsmgmt.bluemix.net/api/alerts/v1",
        auth=(ALERT_USERNAME, ALERT_PASSWORD),
        headers={"Content-Type": "application/json", "accept": "application/json"},
        data=message,
        timeout=10
    )
def _get_Kohls_url_info(url):
# parse passed url
# check if valid product description
if url[8:34] == 'www.kohls.com/product/prd-':
# extract product_id
prd_id = url.split('/prd-')[1].split('/')[0]
raw_desc = []
# loop to handle missed connections
while raw_desc == []:
# retrieve page info
pageContent=requests.get(url)
# convert to html
tree = html.fromstring(pageContent.content)
# parse html using xpath
raw_desc = tree.xpath('//*[@id="%s_productDetails"]/div/descendant::*/text()' % (prd_id))
# extract product description
desc = ' '.join([i for i in raw_desc if i not in ['PRODUCT FEATURES', '\r', '\n']])
while len(desc.split(' ')) > 1:
desc = desc.replace(' ', ' ')
escapes = ''.join([chr(char) for char in range(1, 32)])
# check if unicode
if type(desc) is unicode:
# unicode.translate() different than string
desc = desc.translate({ord(c): None for c in escapes})
desc = desc[:1000]
desc = ' '.join(desc.split(' ')[:120])
elif type(desc) is str:
desc = desc.translate(None, escapes)
desc = desc[:1000]
desc = ' '.join(desc.split(' ')[:120])
else:
desc = ''
return desc
else:
return False
# Port to listen on: the PORT environment variable, or 5000 when it is unset.
port = os.environ.get('PORT', '5000')

# Start the Flask server on all interfaces only when run as a script.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=int(port))