-
Notifications
You must be signed in to change notification settings - Fork 83
/
extract_content_from_website.py
32 lines (24 loc) · 1.11 KB
/
extract_content_from_website.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from langchain_community.document_loaders import WebBaseLoader
def extract_website_content(
    url: str,
    *,
    max_chars: int = 4000,
    min_chars: int = 200,
) -> str:
    """Fetch a web page and return a whitespace-normalised excerpt of its text.

    The page is loaded with ``WebBaseLoader``. The ``page_content`` of every
    returned document is collapsed to single-spaced text and the pieces are
    joined with a space, so words that were separated only by a line break
    (or by a document boundary) are not fused together.

    Args:
        url: The URL of the website from which to extract content.
        max_chars: Maximum number of characters to return. Defaults to 4000.
        min_chars: Cleaned text that is not longer than this many characters
            is treated as having no usable content. Defaults to 200.

    Returns:
        The first ``max_chars`` characters of the cleaned text when that text
        is longer than ``min_chars`` characters, otherwise an empty string.
        An empty string is also returned when loading or processing fails.
    """
    try:
        documents = WebBaseLoader(url).load()
        # str.split() with no argument splits on any run of whitespace, so
        # re-joining with one space removes newlines while keeping
        # neighbouring words apart.
        pieces = [
            " ".join(doc.page_content.split())
            for doc in documents
            if doc.page_content  # skip None or empty content
        ]
        # Drop pieces that were whitespace-only before joining.
        clean_text = " ".join(piece for piece in pieces if piece)
        return clean_text[:max_chars] if len(clean_text) > min_chars else ""
    except Exception as error:
        # Deliberate best-effort boundary: any failure while loading or
        # processing is reported and mapped to "no content".
        print('Error extracting main content:', error)
        return ""