-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_candidates_DT.py
31 lines (19 loc) · 1.05 KB
/
filter_candidates_DT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from elasticsearch import Elasticsearch
import re
# Connection settings for the Elasticsearch node that the client below talks to.
ES_HOST = {"host" : "localhost", "port" : 9200}
# Name of the index that get_all_similarities() searches.
INDEX_NAME = 'dt-index'
# Shared client used for every search in this module. timeout=300 is passed
# straight to the client constructor (presumably seconds -- confirm against the
# installed elasticsearch-py version).
es = Elasticsearch(hosts = [ES_HOST], timeout=300)
def filter(comparison_object, candidates):
    """Keep the candidates that the index lists as similar to ``comparison_object``.

    A candidate is kept when its first element equals one of the similar
    terms returned by ``get_all_similarities(comparison_object)``, either
    verbatim or after every character other than ASCII letters, digits and
    spaces has been removed from the similar term.

    Args:
        comparison_object: Term whose similar terms are looked up.
        candidates: Iterable of indexable items; only ``candidate[0]`` is
            inspected. It is expected to be a string (it must be hashable).

    Returns:
        A list with ``candidate[0]`` of the first 10 matching candidates,
        in the order they appear in ``candidates``.
    """
    max_results = 10
    similarities = get_all_similarities(comparison_object)

    # Build both accepted spellings (raw and stripped) once, so every
    # candidate costs a single set lookup instead of one regex substitution
    # per similar term.
    accepted = set(similarities)
    accepted.update(re.sub('[^a-zA-Z0-9 ]', '', s) for s in similarities)

    filtered_candidates = []
    for candidate in candidates:
        if candidate[0] in accepted:
            filtered_candidates.append(candidate[0])
            # Only the first 10 matches are ever returned, so stop early.
            if len(filtered_candidates) == max_results:
                break
    return filtered_candidates
def get_all_similarities(comparison_object):
    """Return the ``second`` field of every hit whose ``first`` field matches.

    Runs a ``match`` query on the ``first`` field of ``INDEX_NAME`` through
    the module-level ``es`` client, requesting up to 10000 hits.

    Args:
        comparison_object: Value to match against the ``first`` field.

    Returns:
        A list with ``hit['_source']['second']`` for each returned hit, in
        the order the search response lists them.
    """
    query = {"query": {"match": {"first": comparison_object}}}
    response = es.search(index=INDEX_NAME, size=10000, body=query)

    seconds = []
    for hit in response['hits']['hits']:
        seconds.append(hit['_source']['second'])
    return seconds