# text.py: extract readable text from fetched pages, split it into chunks,
# and rank the chunks by semantic similarity to a query.
import math
from itertools import repeat, chain
from typing import NamedTuple

from bs4 import BeautifulSoup
from newspaper import fulltext
from newspaper.cleaners import DocumentCleaner
from newspaper.configuration import Configuration
from newspaper.extractors import ContentExtractor
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

import settings

# Embedding model tuned for semantic search; used to score chunks against the query.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')


class Chunk(NamedTuple):
    """A scored chunk of page text, with its source URL and page title."""
    text: str
    similarity: float
    link: str
    title: str


def strip_html(text):
    # Drop non-content elements (scripts, navigation, media, forms) before extraction.
    soup = BeautifulSoup(text, 'html.parser')
    for e in soup.find_all():
        if e.name in ['script', 'head', 'style', 'aside', 'footer', 'header', 'nav', 'img', 'svg', 'button', 'form']:
            e.extract()
    return str(soup)


def get_text(text):
    # Pull the main article body with newspaper; return None if extraction fails.
    text = strip_html(text)
    try:
        text = fulltext(text, language=settings.LANGUAGE)
    except (AttributeError, TypeError, UnicodeDecodeError):
        text = None
    return text


def get_title(text):
    # Parse and clean the document, then ask newspaper's extractor for the page title.
    title = None
    try:
        config = Configuration()
        config.language = settings.LANGUAGE
        extractor = ContentExtractor(config)
        document_cleaner = DocumentCleaner(config)
        doc = config.get_parser().fromstring(text)
        doc = document_cleaner.clean(doc)
        title = extractor.get_title(doc)
    except (AttributeError, TypeError):
        pass
    return title


def split_words(text):
    return text.split()


def split_sentences(text):
    return sent_tokenize(text)


def split_chunks(text):
    # Greedily pack whole sentences into chunks of at most CHUNK_MAX_LENGTH words.
    sents = split_sentences(text)
    chunks = []
    chunk = ""
    chunk_len = 0
    for sent in sents:
        sent_len = len(split_words(sent))
        if sent_len + chunk_len < settings.CHUNK_MAX_LENGTH:
            chunk += " " + sent
            chunk_len += sent_len
        else:
            chunks.append(chunk)
            chunk = sent
            chunk_len = sent_len
    # Keep the trailing chunk only if it is long enough to be useful.
    if chunk_len > settings.CHUNK_MIN_LENGTH:
        chunks.append(chunk)
    return chunks


def find_likely_chunk(link, query_doc):
    # Score every chunk of the linked page against the query embedding and
    # return the top half of the chunk budget; None if the page is unusable.
    if not link.html:
        return None
    title = get_title(link.html)
    text = get_text(link.html)
    if not title or not text:
        return None
    chunked = split_chunks(text)
    if len(chunked) == 0:
        return None
    chunk_docs = model.encode(chunked)
    similarities = util.cos_sim(query_doc, chunk_docs)[0].tolist()
    chunks = [Chunk(chunk, similarity, link.link, title)
              for chunk, similarity in zip(chunked, similarities)]
    sorted_chunks = sorted(chunks, key=lambda x: x.similarity, reverse=True)
    return sorted_chunks[:max(1, math.floor(settings.CHUNK_LIMIT / 2))]


def filter_list(links):
    # De-duplicate links by URL, keeping the first occurrence of each.
    seen = set()
    filtered_links = []
    for link in links:
        if link.link not in seen:
            filtered_links.append(link)
            seen.add(link.link)
    return filtered_links


def find_likely_chunks(links, query):
    # Embed the query once, score chunks from every unique link, then return
    # the most similar chunks overall, capped at CHUNK_LIMIT.
    links = filter_list(links)
    query_doc = model.encode(query)
    per_link = map(find_likely_chunk, links, repeat(query_doc))
    per_link = [c for c in per_link if c]  # drop links that yielded no chunks
    chunks = list(chain.from_iterable(per_link))
    sorted_chunks = sorted(chunks, key=lambda x: x.similarity, reverse=True)
    return sorted_chunks[:settings.CHUNK_LIMIT]
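

# Minimal usage sketch (not part of the original module): the functions above
# expect link objects exposing `.html` (fetched page source) and `.link` (URL),
# plus a `settings` module defining LANGUAGE, CHUNK_MAX_LENGTH, CHUNK_MIN_LENGTH
# and CHUNK_LIMIT. The Link type and the fetching step below are illustrative
# assumptions, not the repository's actual types.
if __name__ == '__main__':
    import requests

    class Link(NamedTuple):
        link: str
        html: str

    urls = ['https://example.com/article']
    links = [Link(link=u, html=requests.get(u, timeout=10).text) for u in urls]
    for chunk in find_likely_chunks(links, "what is the article about?"):
        print(f"{chunk.similarity:.3f}  {chunk.title}: {chunk.text[:80]}...")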