crawl.py
import hashlib
import os
import re
import threading
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from manager.call import manager_insert_data, manager_get_id, manager_remove_data, manager_edit_data

def summarize_text(text, max_length=174):
    """Trim text to at most max_length characters, cutting at the last space."""
    if len(text) <= max_length:
        return text
    last_space_index = text.rfind(' ', 0, max_length)
    if last_space_index == -1:
        # No space before the limit: cut hard at max_length.
        last_space_index = max_length
    return text[:last_space_index] + '...'

def classify_website(url):
    """Classify a URL as 'Image', 'Video', or 'Text' from its extension or host."""
    image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']
    if any(url.lower().endswith(ext) for ext in image_extensions):
        return 'Image'
    elif 'youtube.com/watch' in url or 'youtu.be/' in url:
        return 'Video'
    else:
        return 'Text'

def get_website_info(url, headers, site_type):
    """Fetch a page and extract its title, visible text, description and keywords."""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else ''
        tags_to_extract = ['p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong', 'em',
                           'blockquote', 'cite', 'q', 'dfn', 'abbr', 'time', 'code', 'var', 'samp',
                           'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark', 'small', 'del', 'ins', 's']
        text_content = ' '.join(tag.get_text().strip() for tag in soup.find_all(tags_to_extract))
        description_tag = soup.select_one('meta[name="description"]')
        description = description_tag['content'] if description_tag else ''
        keywords_tag = soup.select_one('meta[name="keywords"]')
        keywords = keywords_tag['content'] if keywords_tag else ''
        if site_type == 'Video':
            # For YouTube links, store the video id in the description field.
            youtube_id_match = re.search(r'youtube\.com/watch\?v=([^&]*)', url)
            youtube_short_id_match = re.search(r'youtu\.be/([^&]*)', url)
            if youtube_id_match:
                description = youtube_id_match.group(1)
            elif youtube_short_id_match:
                description = youtube_short_id_match.group(1)
        return {
            "title": title,
            "text_content": text_content,
            "description": description,
            "keywords": keywords,
            # Raw HTML is returned so the caller can extract links without refetching.
            "html": response.text
        }
    except Exception as e:
        print(f"Error when getting website info from {url}: {e}")
        return None

def add_to_crawl_list(url):
    """Append a URL to crawl.txt unless it is invalid or already listed."""
    with open("./crawl.txt", "a+", encoding='utf-8') as crawl_list:
        crawl_list.seek(0)
        lines = crawl_list.readlines()
        if any(url == line.strip() for line in lines):
            return "The request already exists in the list."
        try:
            result = urlparse(url)
            is_valid = all([result.scheme, result.netloc])
        except ValueError:
            is_valid = False
        if not is_valid:
            return "Your URL is invalid."
        crawl_list.write(url + '\n')
        return "Your request has been successfully added to the list."

def load_to_deque(thread_id):
    """Load the per-thread URL list into a deque, one stripped URL per element."""
    with open(f"./{thread_id}.txt", 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file if line.strip()]
    return deque(lines)

def ATMT(thread_id, username="", password=""):
    """Crawl every URL queued for this thread and store the results through the manager API."""
    print(f"Thread {thread_id} is running...")
    # Claim the pending list so new requests can accumulate in a fresh crawl.txt.
    os.rename('crawl.txt', f'{thread_id}.txt')
    password = hashlib.md5(hashlib.sha256(password.encode('utf-8')).hexdigest().encode()).hexdigest()
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107 Safari/537.36'
    headers = {'User-Agent': user_agent}
    investigation_list = load_to_deque(thread_id)
    checked_urls = set()
    while investigation_list:
        url = investigation_list.popleft()
        checked_urls.add(url)
        print("Investigating url: ", url)
        site_type = classify_website(url)
        website_info = get_website_info(url, headers, site_type)
        if website_info is not None:
            print("Title: ", website_info["title"])
            summary = summarize_text(website_info["text_content"])
            result = manager_insert_data(site_type, username, password, url, website_info["title"],
                                         website_info["text_content"], website_info["description"],
                                         website_info["keywords"], summary)
            if result == "Content already exists in the database.":
                # Already indexed: update the existing record instead of inserting a duplicate.
                site_id = manager_get_id(site_type, url)
                print(manager_edit_data(site_type, username, password, site_id, url, website_info["title"],
                                        website_info["text_content"], website_info["description"],
                                        website_info["keywords"], summary))
            else:
                print(result)
            print("---------")
            # Queue every link, image and video source found on the page.
            soup = BeautifulSoup(website_info["html"], 'html.parser')
            a_links = [a['href'] for a in soup.select('a[href]') if a['href']]
            img_links = [img['src'] for img in soup.select('img[src]') if img['src']]
            video_links = [video['src'] for video in soup.select('video[src]') if video['src']]
            all_links = a_links + img_links + video_links
            for link in all_links:
                new_url = urljoin(url, link)
                if new_url not in investigation_list and new_url not in checked_urls:
                    investigation_list.append(new_url)
        else:
            # The page could not be fetched: drop any stale record that may exist for it.
            site_id = manager_get_id(site_type, url)
            if site_id is not None:
                print(manager_remove_data(site_type, username, password, site_id))
    os.remove(f"./{thread_id}.txt")

def check_and_create_thread():
    """Start a new crawler thread if there are pending URLs in crawl.txt."""
    if os.path.exists('crawl.txt'):
        thread_id = threading.active_count()
        new_thread = threading.Thread(target=ATMT, args=(thread_id,))
        new_thread.start()
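
# A minimal usage sketch (assumption: this module is run directly and the
# manager.call backend is reachable). The queued address is a placeholder,
# not part of the original project.
if __name__ == '__main__':
    print(add_to_crawl_list('https://example.com'))
    # Start a crawler thread if crawl.txt now contains pending URLs.
    check_and_create_thread()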